diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
new file mode 100644
index 000000000..8383669fc
--- /dev/null
+++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
@@ -0,0 +1,375 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/cuda_stream_ref.hpp>
+
+#include <thrust/device_malloc_allocator.h>
+#include <thrust/device_vector.h>
+
+#include <cuda/std/array>
+
+#include <climits>
+#include <cstddef>
+
+namespace cuco {
+namespace experimental {
+namespace detail {
+
+/**
+ * @brief Struct to store ranks of bits at 256-bit intervals (or blocks)
+ *
+ * This struct encodes a list of four rank values using base + offset format
+ * e.g. [1000, 1005, 1006, 1009] is stored as base = 1000, offsets = [5, 6, 9]
+ * base uses 40 bits, split between one uint32_t and one uint8_t
+ * each offset uses 8 bits
+ */
+struct rank {
+  uint32_t base_hi_;                      ///< Upper 32 bits of base
+  uint8_t base_lo_;                       ///< Lower 8 bits of base
+  cuda::std::array<uint8_t, 3> offsets_;  ///< Offsets for 64-bit sub-intervals, relative to base
+
+  /**
+   * @brief Gets base rank of current 256-bit interval
+   *
+   * @return The base rank
+   */
+  __host__ __device__ constexpr uint64_t base() const noexcept
+  {
+    return (static_cast<uint64_t>(base_hi_) << CHAR_BIT) | base_lo_;
+  }
+
+  /**
+   * @brief Sets base rank of current 256-bit interval
+   *
+   * @param base Base rank
+   */
+  __host__ __device__ constexpr void set_base(uint64_t base) noexcept
+  {
+    base_hi_ = static_cast<uint32_t>(base >> CHAR_BIT);
+    base_lo_ = static_cast<uint8_t>(base);
+  }
+};
+
+/**
+ * @brief Bitset class with rank and select index structures
+ *
+ * In addition to standard bitset set/test operations, this class provides
+ * rank and select operation API. It maintains index structures to make both these
+ * new operations close to constant time.
+ *
+ * Current limitations:
+ * - Stream controls are partially supported due to the use of `thrust::device_vector` as storage
+ * - Device ref doesn't support modifiers like `set`, `reset`, etc.
+ *
+ * @tparam Allocator Type of allocator used for device storage
+ */
+// TODO: have to use device_malloc_allocator for now otherwise the container cannot grow
+template <class Allocator = thrust::device_malloc_allocator<std::byte>>
+class dynamic_bitset {
+ public:
+  using size_type = std::size_t;  ///< size type to specify bit index
+  using word_type = uint64_t;     ///< word type
+  /// Type of the allocator to (de)allocate words
+  using allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<word_type>;
+
+  /// Number of bits per block. Note this is a tradeoff between space efficiency and perf.
+  static constexpr size_type words_per_block = 4;
+  /// Number of bits in a word
+  static constexpr size_type bits_per_word = sizeof(word_type) * CHAR_BIT;
+  /// Number of bits in a block
+  static constexpr size_type bits_per_block = words_per_block * bits_per_word;
+
+  /**
+   * @brief Constructs an empty bitset
+   *
+   * @param allocator Allocator used for allocating device storage
+   */
+  constexpr dynamic_bitset(Allocator const& allocator = Allocator{});
+
+  /**
+   * @brief Appends the given element `value` to the end of the bitset
+   *
+   * This API may involve data reallocation if the current storage is exhausted.
+   *
+   * @param value Boolean value of the new bit to be added
+   */
+  constexpr void push_back(bool value) noexcept;
+
+  /**
+   * @brief Sets the target bit indexed by `index` to a specified `value`.
+   *
+   * @param index Position of bit to be modified
+   * @param value New value of the target bit
+   */
+  constexpr void set(size_type index, bool value) noexcept;
+
+  /**
+   * @brief Sets the last bit to a specified value
+   *
+   * @param value New value of the last bit
+   */
+  constexpr void set_last(bool value) noexcept;
+
+  /**
+   * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the
+   * boolean value at position `keys_begin[i]` to `output_begin[i]`.
+   *
+   * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+   * `size_type`
+   * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean
+   * type
+   *
+   * @param keys_begin Begin iterator to keys list whose values are queried
+   * @param keys_end End iterator to keys list
+   * @param outputs_begin Begin iterator to outputs of test operation
+   * @param stream Stream to execute test kernel
+   */
+  template <typename KeyIt, typename OutputIt>
+  constexpr void test(KeyIt keys_begin,
+                      KeyIt keys_end,
+                      OutputIt outputs_begin,
+                      cuda_stream_ref stream = {}) noexcept;
+
+  /**
+   * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores total
+   * count of `1` bits preceeding (but not including) position `keys_begin[i]` to `output_begin[i]`.
+   *
+   * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+   * `size_type`
+   * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
+   * `size_type`
+   *
+   * @param keys_begin Begin iterator to keys list whose ranks are queried
+   * @param keys_end End iterator to keys list
+   * @param outputs_begin Begin iterator to outputs ranks list
+   * @param stream Stream to execute ranks kernel
+   */
+  template <typename KeyIt, typename OutputIt>
+  constexpr void rank(KeyIt keys_begin,
+                      KeyIt keys_end,
+                      OutputIt outputs_begin,
+                      cuda_stream_ref stream = {}) noexcept;
+
+  /**
+   * @brief For any element `keys_begin[i]` in the range `[keys_begin, keys_end)`, stores the
+   * position of `keys_begin[i]`th `1` bit to `output_begin[i]`.
+   *
+   * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+   * `size_type`
+   * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
+   * `size_type`
+   *
+   * @param keys_begin Begin iterator to keys list whose select values are queried
+   * @param keys_end End iterator to keys list
+   * @param outputs_begin Begin iterator to outputs selects list
+   * @param stream Stream to execute selects kernel
+   */
+  template <typename KeyIt, typename OutputIt>
+  constexpr void select(KeyIt keys_begin,
+                        KeyIt keys_end,
+                        OutputIt outputs_begin,
+                        cuda_stream_ref stream = {}) noexcept;
+
+  using rank_type = cuco::experimental::detail::rank;  ///< Rank type
+
+  /**
+   *@brief Struct to hold all storage refs needed by reference
+   */
+  // TODO: this is not a real ref type, to be changed
+  struct storage_ref_type {
+    const word_type* words_ref_;  ///< Words ref
+
+    const rank_type* ranks_true_ref_;    ///< Ranks ref for 1 bits
+    const size_type* selects_true_ref_;  ///< Selects ref for 1 bits
+
+    const rank_type* ranks_false_ref_;    ///< Ranks ref for 0 bits
+    const size_type* selects_false_ref_;  ///< Selects ref 0 bits
+  };
+
+  /**
+   * @brief Device non-owning reference type of dynamic_bitset
+   */
+  class reference {
+   public:
+    /**
+     * @brief Constructs a reference
+     *
+     * @param storage Struct with non-owning refs to bitset storage arrays
+     */
+    __host__ __device__ explicit constexpr reference(storage_ref_type storage) noexcept;
+
+    /**
+     * @brief Access value of a single bit
+     *
+     * @param key Position of bit
+     *
+     * @return Value of bit at position specified by key
+     */
+    [[nodiscard]] __device__ constexpr bool test(size_type key) const noexcept;
+
+    /**
+     * @brief Access a single word of internal storage
+     *
+     * @param word_id Index of word
+     *
+     * @return Word at position specified by index
+     */
+    [[nodiscard]] __device__ constexpr word_type word(size_type word_id) const noexcept;
+
+    /**
+     * @brief Find position of first set bit starting from a given position (inclusive)
+     *
+     * @param key Position of starting bit
+     *
+     * @return Index of next set bit
+     */
+    [[nodiscard]] __device__ size_type find_next(size_type key) const noexcept;
+
+    /**
+     * @brief Find number of set bits (rank) in all positions before the input position (exclusive)
+     *
+     * @param key Input bit position
+     *
+     * @return Rank of input position
+     */
+    [[nodiscard]] __device__ constexpr size_type rank(size_type key) const noexcept;
+
+    /**
+     * @brief Find position of Nth set (1) bit counting from start
+     *
+     * @param count Input N
+     *
+     * @return Position of Nth set bit
+     */
+    [[nodiscard]] __device__ constexpr size_type select(size_type count) const noexcept;
+
+    /**
+     * @brief Find position of Nth not-set (0) bit counting from start
+     *
+     * @param count Input N
+     *
+     * @return Position of Nth not-set bit
+     */
+    [[nodiscard]] __device__ constexpr size_type select_false(size_type count) const noexcept;
+
+   private:
+    /**
+     * @brief Helper function for select operation that computes an initial rank estimate
+     *
+     * @param count Input count for which select operation is being performed
+     * @param selects Selects array
+     * @param ranks Ranks array
+     *
+     * @return index in ranks which corresponds to highest rank less than count (least upper bound)
+     */
+    template <typename SelectsRef, typename RanksRef>
+    [[nodiscard]] __device__ constexpr size_type initial_rank_estimate(
+      size_type count, const SelectsRef& selects, const RanksRef& ranks) const noexcept;
+
+    /**
+     * @brief Subtract rank estimate from input count and return an increment to word_id
+     *
+     * @tparam Rank type
+     *
+     * @param count Input count that will be updated
+     * @param rank  Initial rank estimate for count
+     *
+     * @return Increment to word_id based on rank values
+     */
+    template <typename Rank>
+    [[nodiscard]] __device__ constexpr size_type subtract_rank_from_count(size_type& count,
+                                                                          Rank rank) const noexcept;
+
+    /**
+     * @brief Find position of Nth set bit in a 64-bit word
+     *
+     * @param N Input count
+     *
+     * @return Position of Nth set bit
+     */
+    [[nodiscard]] __device__ size_type select_bit_in_word(size_type N,
+                                                          word_type word) const noexcept;
+
+    storage_ref_type storage_;  ///< Non-owning storage
+  };
+
+  using ref_type = reference;  ///< Non-owning container ref type
+
+  /**
+   * @brief Gets non-owning device ref of the current object
+   *
+   * @return Device ref of the current `dynamic_bitset` object
+   */
+  [[nodiscard]] constexpr ref_type ref() const noexcept;
+
+  /**
+   * @brief Gets the number of bits dynamic_bitset holds
+   *
+   * @return Number of bits dynamic_bitset holds
+   */
+  [[nodiscard]] constexpr size_type size() const noexcept;
+
+ private:
+  /// Type of the allocator to (de)allocate ranks
+  using rank_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<rank_type>;
+  /// Type of the allocator to (de)allocate indices
+  using size_allocator_type = typename std::allocator_traits<Allocator>::rebind_alloc<size_type>;
+
+  allocator_type allocator_;  ///< Words allocator
+  size_type n_bits_;          ///< Number of bits dynamic_bitset currently holds
+  bool is_built_;  ///< Flag indicating whether the rank and select indices are built or not
+
+  /// Words vector that represents all bits
+  thrust::device_vector<word_type, allocator_type> words_;
+  /// Rank values for every 256-th bit (4-th word)
+  thrust::device_vector<rank_type, rank_allocator_type> ranks_true_;
+  /// Same as ranks_ but for `0` bits
+  thrust::device_vector<rank_type, rank_allocator_type> ranks_false_;
+  /// Block indices of (0, 256, 512...)th `1` bit
+  thrust::device_vector<size_type, size_allocator_type> selects_true_;
+  /// Same as selects_, but for `0` bits
+  thrust::device_vector<size_type, size_allocator_type> selects_false_;
+
+  /**
+   * @brief Builds indexes for rank and select
+   *
+   * @param stream Stream to execute kernels
+   */
+  constexpr void build(cuda_stream_ref stream = {}) noexcept;
+
+  /**
+   * @brief Populates rank and select indexes for true or false bits
+   *
+   * @param ranks Output array of ranks
+   * @param selects Output array of selects
+   * @param flip_bits If true, negate bits to construct indexes for false bits
+   * @param stream Stream to execute kernels
+   */
+  constexpr void build_ranks_and_selects(
+    thrust::device_vector<rank_type, rank_allocator_type>& ranks,
+    thrust::device_vector<size_type, size_allocator_type>& selects,
+    bool flip_bits,
+    cuda_stream_ref stream = {});
+};
+
+}  // namespace detail
+}  // namespace experimental
+}  // namespace cuco
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl>
diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl
new file mode 100644
index 000000000..d56ef9d7c
--- /dev/null
+++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.inl
@@ -0,0 +1,404 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuco/detail/trie/dynamic_bitset/kernels.cuh>
+#include <cuco/detail/utility/cuda.hpp>
+#include <cuco/detail/utils.hpp>
+
+#include <thrust/device_vector.h>
+#include <thrust/iterator/discard_iterator.h>
+
+#include <cub/device/device_scan.cuh>
+#include <cub/device/device_select.cuh>
+
+#include <cuda/std/bit>
+
+namespace cuco {
+namespace experimental {
+namespace detail {
+
+template <class Allocator>
+constexpr dynamic_bitset<Allocator>::dynamic_bitset(Allocator const& allocator)
+  : allocator_{allocator},
+    n_bits_{0},
+    is_built_{false},
+    words_{allocator},
+    ranks_true_{allocator},
+    ranks_false_{allocator},
+    selects_true_{allocator},
+    selects_false_{allocator}
+{
+}
+
+template <class Allocator>
+constexpr void dynamic_bitset<Allocator>::push_back(bool bit) noexcept
+{
+  if (n_bits_ % bits_per_block == 0) {
+    words_.resize(words_.size() + words_per_block);  // Extend storage by one block
+  }
+
+  set(n_bits_++, bit);
+}
+
+template <class Allocator>
+constexpr void dynamic_bitset<Allocator>::set(size_type index, bool bit) noexcept
+{
+  is_built_         = false;
+  size_type word_id = index / bits_per_word;
+  size_type bit_id  = index % bits_per_word;
+  if (bit) {
+    words_[word_id] |= 1UL << bit_id;
+  } else {
+    words_[word_id] &= ~(1UL << bit_id);
+  }
+}
+
+template <class Allocator>
+constexpr void dynamic_bitset<Allocator>::set_last(bool bit) noexcept
+{
+  set(n_bits_ - 1, bit);
+}
+
+template <class Allocator>
+template <typename KeyIt, typename OutputIt>
+constexpr void dynamic_bitset<Allocator>::test(KeyIt keys_begin,
+                                               KeyIt keys_end,
+                                               OutputIt outputs_begin,
+                                               cuda_stream_ref stream) noexcept
+
+{
+  build();
+  auto const num_keys = cuco::detail::distance(keys_begin, keys_end);
+  if (num_keys == 0) { return; }
+
+  auto const grid_size = cuco::detail::grid_size(num_keys);
+
+  bitset_test_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    ref(), keys_begin, outputs_begin, num_keys);
+}
+
+template <class Allocator>
+template <typename KeyIt, typename OutputIt>
+constexpr void dynamic_bitset<Allocator>::rank(KeyIt keys_begin,
+                                               KeyIt keys_end,
+                                               OutputIt outputs_begin,
+                                               cuda_stream_ref stream) noexcept
+{
+  build();
+  auto const num_keys = cuco::detail::distance(keys_begin, keys_end);
+  if (num_keys == 0) { return; }
+
+  auto const grid_size = cuco::detail::grid_size(num_keys);
+
+  bitset_rank_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    ref(), keys_begin, outputs_begin, num_keys);
+}
+
+template <class Allocator>
+template <typename KeyIt, typename OutputIt>
+constexpr void dynamic_bitset<Allocator>::select(KeyIt keys_begin,
+                                                 KeyIt keys_end,
+                                                 OutputIt outputs_begin,
+                                                 cuda_stream_ref stream) noexcept
+
+{
+  build();
+  auto const num_keys = cuco::detail::distance(keys_begin, keys_end);
+  if (num_keys == 0) { return; }
+
+  auto const grid_size = cuco::detail::grid_size(num_keys);
+
+  bitset_select_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    ref(), keys_begin, outputs_begin, num_keys);
+}
+
+template <class Allocator>
+constexpr void dynamic_bitset<Allocator>::build_ranks_and_selects(
+  thrust::device_vector<rank_type, rank_allocator_type>& ranks,
+  thrust::device_vector<size_type, size_allocator_type>& selects,
+  bool flip_bits,
+  cuda_stream_ref stream)
+{
+  if (n_bits_ == 0) { return; }
+
+  // Step 1. Compute prefix sum of per-word bit counts
+  // Population counts for each word
+  size_type const num_words = words_.size();
+  // Sized to have one extra entry for subsequent prefix sum
+  auto const bit_counts_size = num_words + 1;
+
+  thrust::device_vector<size_type, size_allocator_type> bit_counts(num_words + 1, this->allocator_);
+  auto const bit_counts_begin = thrust::raw_pointer_cast(bit_counts.data());
+
+  auto grid_size = cuco::detail::grid_size(num_words);
+  bit_counts_kernel<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    thrust::raw_pointer_cast(words_.data()), bit_counts_begin, num_words, flip_bits);
+
+  std::size_t temp_storage_bytes = 0;
+  using temp_allocator_type = typename std::allocator_traits<allocator_type>::rebind_alloc<char>;
+  auto temp_allocator       = temp_allocator_type{this->allocator_};
+
+  CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(
+    nullptr, temp_storage_bytes, bit_counts_begin, bit_counts_begin, bit_counts_size, stream));
+
+  // Allocate temporary storage
+  auto d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
+
+  CUCO_CUDA_TRY(cub::DeviceScan::ExclusiveSum(thrust::raw_pointer_cast(d_temp_storage),
+                                              temp_storage_bytes,
+                                              bit_counts_begin,
+                                              bit_counts_begin,
+                                              bit_counts_size,
+                                              stream));
+
+  temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
+
+  // Step 2. Compute ranks
+  auto const num_blocks = (num_words - 1) / words_per_block + 2;
+  ranks.resize(num_blocks);
+
+  grid_size = cuco::detail::grid_size(num_blocks);
+  encode_ranks_from_prefix_bit_counts<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    bit_counts_begin,
+    thrust::raw_pointer_cast(ranks.data()),
+    num_words,
+    num_blocks,
+    words_per_block);
+
+  // Step 3. Compute selects
+  thrust::device_vector<size_type, size_allocator_type> select_markers(num_blocks,
+                                                                       this->allocator_);
+  auto const select_markers_begin = thrust::raw_pointer_cast(select_markers.data());
+
+  mark_blocks_with_select_entries<<<grid_size, cuco::detail::default_block_size(), 0, stream>>>(
+    bit_counts_begin, select_markers_begin, num_blocks, words_per_block, bits_per_block);
+
+  auto d_sum = reinterpret_cast<size_type*>(thrust::raw_pointer_cast(
+    std::allocator_traits<temp_allocator_type>::allocate(temp_allocator, sizeof(size_type))));
+  CUCO_CUDA_TRY(cub::DeviceReduce::Sum(
+    nullptr, temp_storage_bytes, select_markers_begin, d_sum, num_blocks, stream));
+
+  d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
+
+  CUCO_CUDA_TRY(cub::DeviceReduce::Sum(thrust::raw_pointer_cast(d_temp_storage),
+                                       temp_storage_bytes,
+                                       select_markers_begin,
+                                       d_sum,
+                                       num_blocks,
+                                       stream));
+
+  size_type num_selects{};
+  CUCO_CUDA_TRY(
+    cudaMemcpyAsync(&num_selects, d_sum, sizeof(size_type), cudaMemcpyDeviceToHost, stream));
+  stream.synchronize();
+  std::allocator_traits<temp_allocator_type>::deallocate(
+    temp_allocator, thrust::device_ptr<char>{reinterpret_cast<char*>(d_sum)}, sizeof(size_type));
+  temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
+
+  selects.resize(num_selects);
+
+  auto const select_begin = thrust::raw_pointer_cast(selects.data());
+
+  CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(nullptr,
+                                           temp_storage_bytes,
+                                           thrust::make_counting_iterator(0UL),
+                                           select_markers_begin,
+                                           select_begin,
+                                           thrust::make_discard_iterator(),
+                                           num_blocks,
+                                           stream));
+
+  d_temp_storage = temp_allocator.allocate(temp_storage_bytes);
+
+  CUCO_CUDA_TRY(cub::DeviceSelect::Flagged(thrust::raw_pointer_cast(d_temp_storage),
+                                           temp_storage_bytes,
+                                           thrust::make_counting_iterator(0UL),
+                                           select_markers_begin,
+                                           select_begin,
+                                           thrust::make_discard_iterator(),
+                                           num_blocks,
+                                           stream));
+
+  temp_allocator.deallocate(d_temp_storage, temp_storage_bytes);
+}
+
+template <class Allocator>
+constexpr void dynamic_bitset<Allocator>::build(cuda_stream_ref stream) noexcept
+{
+  if (not is_built_) {
+    build_ranks_and_selects(ranks_true_, selects_true_, false, stream);   // 1 bits
+    build_ranks_and_selects(ranks_false_, selects_false_, true, stream);  // 0 bits
+    is_built_ = true;
+  }
+}
+
+template <class Allocator>
+constexpr dynamic_bitset<Allocator>::ref_type dynamic_bitset<Allocator>::ref() const noexcept
+{
+  return ref_type{storage_ref_type{thrust::raw_pointer_cast(words_.data()),
+                                   thrust::raw_pointer_cast(ranks_true_.data()),
+                                   thrust::raw_pointer_cast(selects_true_.data()),
+                                   thrust::raw_pointer_cast(ranks_false_.data()),
+                                   thrust::raw_pointer_cast(selects_false_.data())}};
+}
+
+template <class Allocator>
+constexpr dynamic_bitset<Allocator>::size_type dynamic_bitset<Allocator>::size() const noexcept
+{
+  return n_bits_;
+}
+
+// Device reference implementations
+
+template <class Allocator>
+__host__ __device__ constexpr dynamic_bitset<Allocator>::reference::reference(
+  storage_ref_type storage) noexcept
+  : storage_{storage}
+{
+}
+
+template <class Allocator>
+__device__ constexpr bool dynamic_bitset<Allocator>::reference::test(size_type key) const noexcept
+{
+  return (storage_.words_ref_[key / bits_per_word] >> (key % bits_per_word)) & 1UL;
+}
+
+template <class Allocator>
+__device__ constexpr typename dynamic_bitset<Allocator>::word_type
+dynamic_bitset<Allocator>::reference::word(size_type word_id) const noexcept
+{
+  return storage_.words_ref_[word_id];
+}
+
+template <class Allocator>
+__device__ typename dynamic_bitset<Allocator>::size_type
+dynamic_bitset<Allocator>::reference::find_next(size_type key) const noexcept
+{
+  size_type word_id = key / bits_per_word;
+  size_type bit_id  = key % bits_per_word;
+  word_type word    = storage_.words_ref_[word_id];
+  word &= ~(0UL) << bit_id;
+  while (word == 0) {
+    word = storage_.words_ref_[++word_id];
+  }
+  return word_id * bits_per_word + __ffsll(word) - 1;  // cuda intrinsic
+}
+
+template <class Allocator>
+__device__ constexpr typename dynamic_bitset<Allocator>::size_type
+dynamic_bitset<Allocator>::reference::rank(size_type key) const noexcept
+{
+  size_type word_id   = key / bits_per_word;
+  size_type bit_id    = key % bits_per_word;
+  size_type rank_id   = word_id / words_per_block;
+  size_type offset_id = word_id % words_per_block;
+
+  auto rank   = storage_.ranks_true_ref_[rank_id];
+  size_type n = rank.base();
+
+  if (offset_id != 0) { n += rank.offsets_[offset_id - 1]; }
+
+  n += cuda::std::popcount(storage_.words_ref_[word_id] & ((1UL << bit_id) - 1));
+
+  return n;
+}
+
+template <class Allocator>
+__device__ constexpr typename dynamic_bitset<Allocator>::size_type
+dynamic_bitset<Allocator>::reference::select(size_type count) const noexcept
+{
+  auto rank_id = initial_rank_estimate(count, storage_.selects_true_ref_, storage_.ranks_true_ref_);
+  auto rank    = storage_.ranks_true_ref_[rank_id];
+
+  size_type word_id = rank_id * words_per_block;
+  word_id += subtract_rank_from_count(count, rank);
+
+  return word_id * bits_per_word + select_bit_in_word(count, storage_.words_ref_[word_id]);
+}
+
+template <class Allocator>
+__device__ constexpr typename dynamic_bitset<Allocator>::size_type
+dynamic_bitset<Allocator>::reference::select_false(size_type count) const noexcept
+{
+  auto rank_id =
+    initial_rank_estimate(count, storage_.selects_false_ref_, storage_.ranks_false_ref_);
+  auto rank = storage_.ranks_false_ref_[rank_id];
+
+  size_type word_id = rank_id * words_per_block;
+  word_id += subtract_rank_from_count(count, rank);
+
+  return word_id * bits_per_word + select_bit_in_word(count, ~(storage_.words_ref_[word_id]));
+}
+
+template <class Allocator>
+template <typename SelectsRef, typename RanksRef>
+__device__ constexpr typename dynamic_bitset<Allocator>::size_type
+dynamic_bitset<Allocator>::reference::initial_rank_estimate(size_type count,
+                                                            SelectsRef const& selects,
+                                                            RanksRef const& ranks) const noexcept
+{
+  size_type block_id = count / (bits_per_word * words_per_block);
+  size_type begin    = selects[block_id];
+  size_type end      = selects[block_id + 1] + 1UL;
+
+  if (begin + 10 >= end) {  // Linear search
+    while (count >= ranks[begin + 1].base()) {
+      ++begin;
+    }
+  } else {  // Binary search
+    while (begin + 1 < end) {
+      size_type middle = (begin + end) / 2;
+      if (count < ranks[middle].base()) {
+        end = middle;
+      } else {
+        begin = middle;
+      }
+    }
+  }
+  return begin;
+}
+
+template <class Allocator>
+template <typename Rank>
+__device__ constexpr typename dynamic_bitset<Allocator>::size_type
+dynamic_bitset<Allocator>::reference::subtract_rank_from_count(size_type& count,
+                                                               Rank rank) const noexcept
+{
+  count -= rank.base();
+
+  bool a0       = count >= rank.offsets_[0];
+  bool a1       = count >= rank.offsets_[1];
+  bool a2       = count >= rank.offsets_[2];
+  size_type inc = a0 + a1 + a2;
+
+  count -= (inc > 0) * rank.offsets_[inc - (inc > 0)];
+
+  return inc;
+}
+
+template <class Allocator>
+__device__ typename dynamic_bitset<Allocator>::size_type
+dynamic_bitset<Allocator>::reference::select_bit_in_word(size_type N, word_type word) const noexcept
+{
+  for (size_type pos = 0; pos < N; pos++) {
+    word &= word - 1;
+  }
+  return __ffsll(word & -word) - 1;  // cuda intrinsic
+}
+}  // namespace detail
+}  // namespace experimental
+}  // namespace cuco
diff --git a/include/cuco/detail/trie/dynamic_bitset/kernels.cuh b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh
new file mode 100644
index 000000000..c92ab60b2
--- /dev/null
+++ b/include/cuco/detail/trie/dynamic_bitset/kernels.cuh
@@ -0,0 +1,240 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuco/detail/utility/cuda.cuh>
+#include <cuco/detail/utility/cuda.hpp>
+
+#include <cuda/std/bit>
+
+namespace cuco {
+namespace experimental {
+namespace detail {
+
+/*
+ * @brief Test bits for a range of keys
+ *
+ * @tparam BitsetRef Bitset reference type
+ * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+ * `size_type`
+ * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from boolean
+ * type
+ *
+ * @param ref Bitset ref
+ * @param keys Begin iterator to keys
+ * @param outputs Begin iterator to outputs
+ * @param num_keys Number of input keys
+ */
+template <typename BitsetRef, typename KeyIt, typename OutputIt>
+__global__ void bitset_test_kernel(BitsetRef ref,
+                                   KeyIt keys,
+                                   OutputIt outputs,
+                                   cuco::detail::index_type num_keys)
+{
+  auto key_id       = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (key_id < num_keys) {
+    outputs[key_id] = ref.test(keys[key_id]);
+    key_id += stride;
+  }
+}
+
+/*
+ * @brief Gather rank values for a range of keys
+ *
+ * @tparam BitsetRef Bitset reference type
+ * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+ * `size_type`
+ * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
+ * `size_type`
+ *
+ * @param ref Bitset ref
+ * @param keys Begin iterator to keys
+ * @param outputs Begin iterator to outputs
+ * @param num_keys Number of input keys
+ */
+template <typename BitsetRef, typename KeyIt, typename OutputIt>
+__global__ void bitset_rank_kernel(BitsetRef ref,
+                                   KeyIt keys,
+                                   OutputIt outputs,
+                                   cuco::detail::index_type num_keys)
+{
+  auto key_id       = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (key_id < num_keys) {
+    outputs[key_id] = ref.rank(keys[key_id]);
+    key_id += stride;
+  }
+}
+
+/*
+ * @brief Gather select values for a range of keys
+ *
+ * @tparam BitsetRef Bitset reference type
+ * @tparam KeyIt Device-accessible iterator whose `value_type` can be converted to bitset's
+ * `size_type`
+ * @tparam OutputIt Device-accessible iterator whose `value_type` can be constructed from bitset's
+ * `size_type`
+ *
+ * @param ref Bitset ref
+ * @param keys Begin iterator to keys
+ * @param outputs Begin iterator to outputs
+ * @param num_keys Number of input keys
+ */
+template <typename BitsetRef, typename KeyIt, typename OutputIt>
+__global__ void bitset_select_kernel(BitsetRef ref,
+                                     KeyIt keys,
+                                     OutputIt outputs,
+                                     cuco::detail::index_type num_keys)
+{
+  auto key_id       = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (key_id < num_keys) {
+    outputs[key_id] = ref.select(keys[key_id]);
+    key_id += stride;
+  }
+}
+
+/*
+ * @brief Computes number of set or not-set bits in each word
+ *
+ * @tparam WordType Word type
+ * @tparam SizeType Size type
+ *
+ * @param words Input array of words
+ * @param bit_counts Output array of per-word bit counts
+ * @param num_words Number of words
+ * @param flip_bits Boolean to request negation of words before counting bits
+ */
+template <typename WordType, typename SizeType>
+__global__ void bit_counts_kernel(WordType const* words,
+                                  SizeType* bit_counts,
+                                  cuco::detail::index_type num_words,
+                                  bool flip_bits)
+{
+  auto word_id      = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (word_id < num_words) {
+    auto word           = words[word_id];
+    bit_counts[word_id] = cuda::std::popcount(flip_bits ? ~word : word);
+    word_id += stride;
+  }
+}
+
+/*
+ * @brief Compute rank values at block size intervals.
+ *
+ * ranks[i] = Number of set bits in [0, i) range
+ * This kernel transforms prefix sum array of per-word bit counts
+ * into base-delta encoding style of `rank` struct.
+ * Since prefix sum is available, there are no dependencies across blocks.
+
+ * @tparam SizeType Size type
+ *
+ * @param prefix_bit_counts Prefix sum array of per-word bit counts
+ * @param ranks Output array of ranks
+ * @param num_words Length of input array
+ * @param num_blocks Length of ouput array
+ * @param words_per_block Number of words in each block
+ */
+template <typename SizeType>
+__global__ void encode_ranks_from_prefix_bit_counts(const SizeType* prefix_bit_counts,
+                                                    rank* ranks,
+                                                    SizeType num_words,
+                                                    SizeType num_blocks,
+                                                    SizeType words_per_block)
+{
+  auto rank_id      = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (rank_id < num_blocks) {
+    SizeType word_id = rank_id * words_per_block;
+
+    // Set base value of rank
+    auto& rank = ranks[rank_id];
+    rank.set_base(prefix_bit_counts[word_id]);
+
+    if (rank_id < num_blocks - 1) {
+      // For each subsequent word in this block, compute deltas from base
+      for (SizeType block_offset = 0; block_offset < words_per_block - 1; block_offset++) {
+        auto delta = prefix_bit_counts[word_id + block_offset + 1] - prefix_bit_counts[word_id];
+        rank.offsets_[block_offset] = delta;
+      }
+    }
+    rank_id += stride;
+  }
+}
+
+/*
+ * @brief Compute select values at block size intervals.
+ *
+ * selects[i] = Position of (i+ 1)th set bit
+ * This kernel check for blocks where prefix sum crosses a multiple of `bits_per_block`.
+ * Such blocks are marked in the output boolean array
+ *
+ * @tparam SizeType Size type
+ *
+ * @param prefix_bit_counts Prefix sum array of per-word bit counts
+ * @param selects_markers Ouput array indicating whether a block has selects entry or not
+ * @param num_blocks Length of ouput array
+ * @param words_per_block Number of words in each block
+ * @param bits_per_block Number of bits in each block
+ */
+template <typename SizeType>
+__global__ void mark_blocks_with_select_entries(SizeType const* prefix_bit_counts,
+                                                SizeType* select_markers,
+                                                SizeType num_blocks,
+                                                SizeType words_per_block,
+                                                SizeType bits_per_block)
+{
+  auto block_id     = cuco::detail::global_thread_id();
+  auto const stride = cuco::detail::grid_stride();
+
+  while (block_id < num_blocks) {
+    if (block_id == 0) {  // Block 0 always has a selects entry
+      select_markers[block_id] = 1;
+      block_id += stride;
+      continue;
+    }
+
+    select_markers[block_id] = 0;  // Always clear marker first
+    SizeType word_id         = block_id * words_per_block;
+    SizeType prev_count      = prefix_bit_counts[word_id];
+
+    for (size_t block_offset = 1; block_offset <= words_per_block; block_offset++) {
+      SizeType count = prefix_bit_counts[word_id + block_offset];
+
+      // Selects entry is added when cumulative bitcount crosses a multiple of bits_per_block
+      if ((prev_count - 1) / bits_per_block != (count - 1) / bits_per_block) {
+        select_markers[block_id] = 1;
+        break;
+      }
+      prev_count = count;
+    }
+
+    block_id += stride;
+  }
+}
+
+}  // namespace detail
+}  // namespace experimental
+}  // namespace cuco
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index d78ec7f49..3deeeddf1 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -96,3 +96,12 @@ ConfigureTest(STATIC_MULTIMAP_TEST
     static_multimap/multiplicity_test.cu
     static_multimap/non_match_test.cu
     static_multimap/pair_function_test.cu)
+
+###################################################################################################
+# - dynamic_bitset tests --------------------------------------------------------------------------
+ConfigureTest(DYNAMIC_BITSET_TEST
+    dynamic_bitset/find_next_test.cu
+    dynamic_bitset/get_test.cu
+    dynamic_bitset/rank_test.cu
+    dynamic_bitset/select_test.cu
+    dynamic_bitset/size_test.cu)
diff --git a/tests/dynamic_bitset/find_next_test.cu b/tests/dynamic_bitset/find_next_test.cu
new file mode 100644
index 000000000..97ba366ea
--- /dev/null
+++ b/tests/dynamic_bitset/find_next_test.cu
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utils.hpp>
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/host_vector.h>
+
+#include <catch2/catch_test_macros.hpp>
+
+template <class BitsetRef, typename size_type, typename OutputIt>
+__global__ void find_next_kernel(BitsetRef ref, size_type num_elements, OutputIt output)
+{
+  cuco::detail::index_type index  = blockIdx.x * blockDim.x + threadIdx.x;
+  cuco::detail::index_type stride = gridDim.x * blockDim.x;
+  while (index < num_elements) {
+    output[index] = ref.find_next(index);
+    index += stride;
+  }
+}
+
+extern bool modulo_bitgen(uint64_t i);  // Defined in get_test.cu
+
+TEST_CASE("Find next set test", "")
+{
+  cuco::experimental::detail::dynamic_bitset bv;
+
+  using size_type = std::size_t;
+  constexpr size_type num_elements{400};
+
+  for (size_type i = 0; i < num_elements; i++) {
+    bv.push_back(modulo_bitgen(i));
+  }
+
+  thrust::device_vector<size_type> device_result(num_elements);
+  auto ref = bv.ref();
+  find_next_kernel<<<1, 1024>>>(ref, num_elements, device_result.data());
+
+  thrust::host_vector<size_type> host_result = device_result;
+  size_type num_matches                      = 0;
+
+  size_type next_set_pos = -1lu;
+  do {
+    next_set_pos++;
+  } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos));
+
+  for (size_type key = 0; key < num_elements; key++) {
+    num_matches += host_result[key] == next_set_pos;
+
+    if (key == next_set_pos) {
+      do {
+        next_set_pos++;
+      } while (next_set_pos < num_elements and !modulo_bitgen(next_set_pos));
+    }
+  }
+  REQUIRE(num_matches == num_elements);
+}
diff --git a/tests/dynamic_bitset/get_test.cu b/tests/dynamic_bitset/get_test.cu
new file mode 100644
index 000000000..10f81a116
--- /dev/null
+++ b/tests/dynamic_bitset/get_test.cu
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <catch2/catch_test_macros.hpp>
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/sequence.h>
+#include <utils.hpp>
+
+template <class BitsetRef, typename size_type, typename OutputIt>
+__global__ void test_kernel(BitsetRef ref, size_type num_elements, OutputIt output)
+{
+  cuco::detail::index_type index  = blockIdx.x * blockDim.x + threadIdx.x;
+  cuco::detail::index_type stride = gridDim.x * blockDim.x;
+  while (index < num_elements) {
+    output[index] = ref.test(index);
+    index += stride;
+  }
+}
+
+bool modulo_bitgen(uint64_t i) { return i % 7 == 0; }
+
+TEST_CASE("Get test", "")
+{
+  cuco::experimental::detail::dynamic_bitset bv;
+
+  using size_type = std::size_t;
+  constexpr size_type num_elements{400};
+
+  size_type num_set_ref = 0;
+  for (size_type i = 0; i < num_elements; i++) {
+    bv.push_back(modulo_bitgen(i));
+    num_set_ref += modulo_bitgen(i);
+  }
+
+  // Host-bulk test
+  thrust::device_vector<size_type> keys(num_elements);
+  thrust::sequence(keys.begin(), keys.end(), 0);
+
+  thrust::device_vector<size_type> test_result(num_elements);
+  thrust::fill(test_result.begin(), test_result.end(), 0);
+
+  bv.test(keys.begin(), keys.end(), test_result.begin());
+
+  size_type num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0);
+  REQUIRE(num_set == num_set_ref);
+
+  // Device-ref test
+  auto ref = bv.ref();
+  thrust::fill(test_result.begin(), test_result.end(), 0);
+  test_kernel<<<1, 1024>>>(ref, num_elements, test_result.data());
+
+  num_set = thrust::reduce(thrust::device, test_result.begin(), test_result.end(), 0);
+  REQUIRE(num_set == num_set_ref);
+}
diff --git a/tests/dynamic_bitset/rank_test.cu b/tests/dynamic_bitset/rank_test.cu
new file mode 100644
index 000000000..3b4d17cca
--- /dev/null
+++ b/tests/dynamic_bitset/rank_test.cu
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utils.hpp>
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+
+#include <catch2/catch_test_macros.hpp>
+
+extern bool modulo_bitgen(uint64_t i);  // Defined in get_test.cu
+
+TEST_CASE("Rank test", "")
+{
+  cuco::experimental::detail::dynamic_bitset bv;
+
+  using size_type = std::size_t;
+  constexpr size_type num_elements{4000};
+
+  for (size_type i = 0; i < num_elements; i++) {
+    bv.push_back(modulo_bitgen(i));
+  }
+
+  thrust::device_vector<size_type> keys(num_elements);
+  thrust::sequence(keys.begin(), keys.end(), 0);
+
+  thrust::device_vector<size_type> d_ranks(num_elements);
+
+  bv.rank(keys.begin(), keys.end(), d_ranks.begin());
+
+  thrust::host_vector<size_type> h_ranks = d_ranks;
+
+  size_type cur_rank    = 0;
+  size_type num_matches = 0;
+  for (size_type i = 0; i < num_elements; i++) {
+    num_matches += cur_rank == h_ranks[i];
+    if (modulo_bitgen(i)) { cur_rank++; }
+  }
+  REQUIRE(num_matches == num_elements);
+}
diff --git a/tests/dynamic_bitset/select_test.cu b/tests/dynamic_bitset/select_test.cu
new file mode 100644
index 000000000..3dc0d74da
--- /dev/null
+++ b/tests/dynamic_bitset/select_test.cu
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <utils.hpp>
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/sequence.h>
+
+#include <catch2/catch_test_macros.hpp>
+
+template <class BitsetRef, typename size_type, typename OutputIt>
+__global__ void select_false_kernel(BitsetRef ref, size_type num_elements, OutputIt output)
+{
+  cuco::detail::index_type index  = blockIdx.x * blockDim.x + threadIdx.x;
+  cuco::detail::index_type stride = gridDim.x * blockDim.x;
+  while (index < num_elements) {
+    output[index] = ref.select_false(index);
+    index += stride;
+  }
+}
+
+extern bool modulo_bitgen(uint64_t i);  // Defined in get_test.cu
+
+TEST_CASE("Select test", "")
+{
+  cuco::experimental::detail::dynamic_bitset bv;
+
+  using size_type = std::size_t;
+  constexpr size_type num_elements{4000};
+
+  size_type num_set = 0;
+  for (size_type i = 0; i < num_elements; i++) {
+    bv.push_back(modulo_bitgen(i));
+    num_set += modulo_bitgen(i);
+  }
+
+  // Check select
+  {
+    thrust::device_vector<size_type> keys(num_set);
+    thrust::sequence(keys.begin(), keys.end(), 0);
+
+    thrust::device_vector<size_type> d_selects(num_set);
+
+    bv.select(keys.begin(), keys.end(), d_selects.begin());
+
+    thrust::host_vector<size_type> h_selects = d_selects;
+
+    size_type num_matches = 0;
+    size_type cur_set_pos = -1lu;
+    for (size_type i = 0; i < num_set; i++) {
+      do {
+        cur_set_pos++;
+      } while (cur_set_pos < num_elements and !modulo_bitgen(cur_set_pos));
+
+      num_matches += cur_set_pos == h_selects[i];
+    }
+    REQUIRE(num_matches == num_set);
+  }
+
+  // Check select_false
+  {
+    size_type num_not_set = num_elements - num_set;
+
+    auto ref = bv.ref();
+    thrust::device_vector<size_type> device_result(num_not_set);
+    select_false_kernel<<<1, 1024>>>(ref, num_not_set, device_result.data());
+    thrust::host_vector<size_type> host_result = device_result;
+
+    size_type num_matches     = 0;
+    size_type cur_not_set_pos = -1lu;
+    for (size_type i = 0; i < num_not_set; i++) {
+      do {
+        cur_not_set_pos++;
+      } while (cur_not_set_pos < num_elements and modulo_bitgen(cur_not_set_pos));
+
+      num_matches += cur_not_set_pos == host_result[i];
+    }
+    REQUIRE(num_matches == num_not_set);
+  }
+}
diff --git a/tests/dynamic_bitset/size_test.cu b/tests/dynamic_bitset/size_test.cu
new file mode 100644
index 000000000..611159dc3
--- /dev/null
+++ b/tests/dynamic_bitset/size_test.cu
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh>
+
+#include <catch2/catch_test_macros.hpp>
+
+TEST_CASE("Size computation", "")
+{
+  cuco::experimental::detail::dynamic_bitset bv;
+  using size_type = std::size_t;
+  constexpr size_type num_elements{400};
+
+  for (size_type i = 0; i < num_elements; i++) {
+    bv.push_back(i % 2 == 0);  // Alternate 0s and 1s pattern
+  }
+
+  auto size = bv.size();
+  REQUIRE(size == num_elements);
+}