From 657c4faf21700c0899703a4759bde76235c38199 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Tue, 14 May 2024 17:54:40 -0300 Subject: [PATCH] GH-41596: [C++] fixed_width_internal.h: Simplify docstring and support bit-sized types (BOOL) (#41597) ### Rationale for this change Post-merge feedback from #41297. ### What changes are included in this PR? - Supporting `BOOL` as both a top-level and nested in FSL types - Removing the long example from the docstring of `IsFixedWidthLike` These changes don't affect users because this header was added recently and not released. ### Are these changes tested? Yes, by existing and new test cases. * GitHub Issue: #41596 Authored-by: Felipe Oliveira Carvalho Signed-off-by: Felipe Oliveira Carvalho --- .../vector_selection_filter_internal.cc | 4 +- .../kernels/vector_selection_internal.cc | 4 +- .../kernels/vector_selection_take_internal.cc | 5 +- cpp/src/arrow/util/fixed_width_internal.cc | 100 +++--- cpp/src/arrow/util/fixed_width_internal.h | 286 +++++++++--------- cpp/src/arrow/util/fixed_width_test.cc | 21 +- 6 files changed, 212 insertions(+), 208 deletions(-) diff --git a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc index 8d43c65668d4b..5e24331fe96f2 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_filter_internal.cc @@ -164,7 +164,7 @@ class PrimitiveFilterImpl { values_is_valid_(values.buffers[0].data), // No offset applied for boolean because it's a bitmap values_data_(kIsBoolean ? values.buffers[1].data - : util::OffsetPointerOfFixedWidthValues(values)), + : util::OffsetPointerOfFixedByteWidthValues(values)), values_null_count_(values.null_count), values_offset_(values.offset), values_length_(values.length), @@ -470,7 +470,7 @@ Status PrimitiveFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult // validity bitmap. const bool allocate_validity = values.null_count != 0 || !filter_null_count_is_zero; - DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false)); + DCHECK(util::IsFixedWidthLike(values)); const int64_t bit_width = util::FixedWidthInBits(*values.type); RETURN_NOT_OK(util::internal::PreallocateFixedWidthArrayData( ctx, output_length, /*source=*/values, allocate_validity, out_arr)); diff --git a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc index 93cd5060348db..2ba660e49ac38 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_internal.cc @@ -898,7 +898,7 @@ Status FSLFilterExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) // PrimitiveFilterExec for a fixed-size list array. if (util::IsFixedWidthLike(values, /*force_null_count=*/true, - /*exclude_dictionary=*/true)) { + /*exclude_bool_and_dictionary=*/true)) { const auto byte_width = util::FixedWidthInBytes(*values.type); // 0 is a valid byte width for FixedSizeList, but PrimitiveFilterExec // might not handle it correctly. @@ -971,7 +971,7 @@ Status FSLTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { // PrimitiveTakeExec for a fixed-size list array. if (util::IsFixedWidthLike(values, /*force_null_count=*/true, - /*exclude_dictionary=*/true)) { + /*exclude_bool_and_dictionary=*/true)) { const auto byte_width = util::FixedWidthInBytes(*values.type); // Additionally, PrimitiveTakeExec is only implemented for specific byte widths. // TODO(GH-41301): Extend PrimitiveTakeExec for any fixed-width type. diff --git a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc index 48a2de9936cd4..1a9af0efcd700 100644 --- a/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc +++ b/cpp/src/arrow/compute/kernels/vector_selection_take_internal.cc @@ -347,7 +347,7 @@ struct PrimitiveTakeImpl { static void Exec(const ArraySpan& values, const ArraySpan& indices, ArrayData* out_arr) { DCHECK_EQ(util::FixedWidthInBytes(*values.type), kValueWidth); - const auto* values_data = util::OffsetPointerOfFixedWidthValues(values); + const auto* values_data = util::OffsetPointerOfFixedByteWidthValues(values); const uint8_t* values_is_valid = values.buffers[0].data; auto values_offset = values.offset; @@ -588,8 +588,7 @@ Status PrimitiveTakeExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ArrayData* out_arr = out->array_data().get(); - DCHECK(util::IsFixedWidthLike(values, /*force_null_count=*/false, - /*exclude_dictionary=*/true)); + DCHECK(util::IsFixedWidthLike(values)); const int64_t bit_width = util::FixedWidthInBits(*values.type); // TODO: When neither values nor indices contain nulls, we can skip diff --git a/cpp/src/arrow/util/fixed_width_internal.cc b/cpp/src/arrow/util/fixed_width_internal.cc index 164af3cff66b3..3f12fafb54f0f 100644 --- a/cpp/src/arrow/util/fixed_width_internal.cc +++ b/cpp/src/arrow/util/fixed_width_internal.cc @@ -33,11 +33,12 @@ namespace arrow::util { using ::arrow::internal::checked_cast; bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, - bool exclude_dictionary) { - return IsFixedWidthLike(source, force_null_count, - [exclude_dictionary](const DataType& type) { - return !exclude_dictionary || type.id() != Type::DICTIONARY; - }); + bool exclude_bool_and_dictionary) { + return IsFixedWidthLike( + source, force_null_count, [exclude_bool_and_dictionary](const DataType& type) { + return !exclude_bool_and_dictionary || + (type.id() != Type::DICTIONARY && type.id() != Type::BOOL); + }); } static int64_t FixedWidthInBytesFallback(const FixedSizeListType& fixed_size_list_type) { @@ -73,16 +74,37 @@ int64_t FixedWidthInBytes(const DataType& type) { return -1; } +static int64_t FixedWidthInBitsFallback(const FixedSizeListType& fixed_size_list_type) { + auto* fsl = &fixed_size_list_type; + int64_t list_size = fsl->list_size(); + for (auto type = fsl->value_type().get();;) { + auto type_id = type->id(); + if (type_id == Type::FIXED_SIZE_LIST) { + fsl = checked_cast(type); + list_size *= fsl->list_size(); + type = fsl->value_type().get(); + continue; + } + if (is_fixed_width(type_id)) { + const int64_t flat_bit_width = list_size * type->bit_width(); + DCHECK_GE(flat_bit_width, 0); + return flat_bit_width; + } + break; + } + return -1; +} + int64_t FixedWidthInBits(const DataType& type) { auto type_id = type.id(); if (is_fixed_width(type_id)) { return type.bit_width(); } - const int64_t byte_width = FixedWidthInBytes(type); - if (ARROW_PREDICT_FALSE(byte_width < 0)) { - return -1; + if (type_id == Type::FIXED_SIZE_LIST) { + auto& fsl = ::arrow::internal::checked_cast(type); + return FixedWidthInBitsFallback(fsl); } - return byte_width * 8; + return -1; } namespace internal { @@ -121,9 +143,6 @@ Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, if (type->id() == Type::FIXED_SIZE_LIST) { auto& fsl_type = checked_cast(*type); auto& value_type = fsl_type.value_type(); - if (ARROW_PREDICT_FALSE(value_type->id() == Type::BOOL)) { - return Status::Invalid("PreallocateFixedWidthArrayData: Invalid type: ", fsl_type); - } if (ARROW_PREDICT_FALSE(value_type->id() == Type::DICTIONARY)) { return Status::NotImplemented( "PreallocateFixedWidthArrayData: DICTIONARY type allocation: ", *type); @@ -146,16 +165,13 @@ Status PreallocateFixedWidthArrayData(::arrow::compute::KernelContext* ctx, } // namespace internal -/// \pre same as OffsetPointerOfFixedWidthValues -/// \pre source.type->id() != Type::BOOL -static const uint8_t* OffsetPointerOfFixedWidthValuesFallback(const ArraySpan& source) { +std::pair OffsetPointerOfFixedBitWidthValues( + const ArraySpan& source) { using OffsetAndListSize = std::pair; auto get_offset = [](auto pair) { return pair.first; }; auto get_list_size = [](auto pair) { return pair.second; }; ::arrow::internal::SmallVector stack; - DCHECK_NE(source.type->id(), Type::BOOL); - int64_t list_size = 1; auto* array = &source; while (array->type->id() == Type::FIXED_SIZE_LIST) { @@ -166,31 +182,25 @@ static const uint8_t* OffsetPointerOfFixedWidthValuesFallback(const ArraySpan& s // Now that innermost values were reached, pop the stack and calculate the offset // in bytes of the innermost values buffer by considering the offset at each // level of nesting. - DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type)); + DCHECK(is_fixed_width(*array->type)); DCHECK(array == &source || !array->MayHaveNulls()) << "OffsetPointerOfFixedWidthValues: array is expected to be flat or have no " "nulls in the arrays nested by FIXED_SIZE_LIST."; - int64_t value_width = array->type->byte_width(); - int64_t offset_in_bytes = array->offset * value_width; + int64_t value_width_in_bits = array->type->bit_width(); + int64_t offset_in_bits = array->offset * value_width_in_bits; for (auto it = stack.rbegin(); it != stack.rend(); ++it) { - value_width *= get_list_size(*it); - offset_in_bytes += get_offset(*it) * value_width; + value_width_in_bits *= get_list_size(*it); + offset_in_bits += get_offset(*it) * value_width_in_bits; } - return value_width < 0 ? nullptr : array->GetValues(1, offset_in_bytes); + DCHECK_GE(value_width_in_bits, 0); + const auto* values_ptr = array->GetValues(1, 0); + return {static_cast(offset_in_bits % 8), values_ptr + (offset_in_bits / 8)}; } -const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source) { - auto type_id = source.type->id(); - if (is_fixed_width(type_id)) { - if (ARROW_PREDICT_FALSE(type_id == Type::BOOL)) { - // BOOL arrays are bit-packed, thus a byte-aligned pointer cannot be produced in the - // general case. Returning something for BOOL arrays that happen to byte-align - // because offset=0 would create too much confusion. - return nullptr; - } - return source.GetValues(1, 0) + source.offset * source.type->byte_width(); - } - return OffsetPointerOfFixedWidthValuesFallback(source); +const uint8_t* OffsetPointerOfFixedByteWidthValues(const ArraySpan& source) { + DCHECK(IsFixedWidthLike(source, /*force_null_count=*/false, + [](const DataType& type) { return type.id() != Type::BOOL; })); + return OffsetPointerOfFixedBitWidthValues(source).second; } /// \brief Get the mutable pointer to the fixed-width values of an array @@ -203,24 +213,20 @@ const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source) { /// \return The mutable pointer to the fixed-width byte blocks of the array. If /// pre-conditions are not satisfied, the return values is undefined. uint8_t* MutableFixedWidthValuesPointer(ArrayData* mutable_array) { - auto type_id = mutable_array->type->id(); - if (type_id == Type::FIXED_SIZE_LIST) { - auto* array = mutable_array; - do { - DCHECK_EQ(array->offset, 0); - DCHECK_EQ(array->child_data.size(), 1) << array->type->ToString(true) << " part of " - << mutable_array->type->ToString(true); - array = array->child_data[0].get(); - } while (array->type->id() == Type::FIXED_SIZE_LIST); + auto* array = mutable_array; + auto type_id = array->type->id(); + while (type_id == Type::FIXED_SIZE_LIST) { DCHECK_EQ(array->offset, 0); - DCHECK(array->type->id() != Type::BOOL && is_fixed_width(*array->type)); - return array->GetMutableValues(1, 0); + DCHECK_EQ(array->child_data.size(), 1) << array->type->ToString(true) << " part of " + << mutable_array->type->ToString(true); + array = array->child_data[0].get(); + type_id = array->type->id(); } DCHECK_EQ(mutable_array->offset, 0); // BOOL is allowed here only because the offset is expected to be 0, // so the byte-aligned pointer also points to the first *bit* of the buffer. DCHECK(is_fixed_width(type_id)); - return mutable_array->GetMutableValues(1, 0); + return array->GetMutableValues(1, 0); } } // namespace arrow::util diff --git a/cpp/src/arrow/util/fixed_width_internal.h b/cpp/src/arrow/util/fixed_width_internal.h index f6959485fbd01..232411f4c4a56 100644 --- a/cpp/src/arrow/util/fixed_width_internal.h +++ b/cpp/src/arrow/util/fixed_width_internal.h @@ -56,146 +56,140 @@ namespace arrow::util { /// Additionally, we say that a type is "fixed-width like" if it's a fixed-width as /// defined above, or if it's a fixed-size list (or nested fixed-size lists) and /// the innermost type is fixed-width and the following restrictions also apply: -/// - The value type of the innermost fixed-size list is not BOOL (it has to be excluded -/// because a 1-bit type doesn't byte-align) /// - Only the top-level array may have nulls, all the inner array have to be completely /// free of nulls so we don't need to manage internal validity bitmaps. /// -/// Take the following `fixed_size_list, 3>` array as an -/// example: -/// -/// [ -/// [[1, 2], [3, 4], [ 5, 6]], -/// null, -/// [[7, 8], [9, 10], [11, 12]] -/// ] -/// -/// in memory, it would look like: -/// -/// { -/// type: fixed_size_list, 3>, -/// length: 3, -/// null_count: 1, -/// offset: 0, -/// buffers: [ -/// 0: [0b00000101] -/// ], -/// child_data: [ -/// 0: { -/// type: fixed_size_list, -/// length: 9, -/// null_count: 0, -/// offset: 0, -/// buffers: [0: NULL], -/// child_data: [ -/// 0: { -/// type: int32, -/// length: 18, -/// null_count: 0, -/// offset: 0, -/// buffers: [ -/// 0: NULL, -/// 1: [ 1, 2, 3, 4, 5, 6, -/// 0, 0, 0, 0, 0, 0 -/// 7, 8, 9, 10, 11, 12 ] -/// ], -/// child_data: [] -/// } -/// ] -/// } -/// ] -/// } -/// -/// This layout fits the fixed-width like definition because the innermost type -/// is byte-aligned fixed-width (int32 = 4 bytes) and the internal arrays don't -/// have nulls. The validity bitmap is only needed at the top-level array. -/// -/// Writing to this array can be done in the same way writing to a flat fixed-width -/// array is done, by: -/// 1. Updating the validity bitmap at the top-level array if nulls are present. -/// 2. Updating a continuous fixed-width block of memory through a single pointer. -/// -/// The length of this block of memory is the product of the list sizes in the -/// `FixedSizeList` types and the byte width of the innermost fixed-width type: -/// -/// 3 * 2 * 4 = 24 bytes -/// -/// Writing the `[[1, 2], [3, 4], [5, 6]]` value at a given index can be done by -/// simply setting the validity bit to 1 and writing the 24-byte sequence of -/// integers `[1, 2, 3, 4, 5, 6]` to the memory block at `byte_ptr + index * 24`. -/// -/// The length of the top-level array fully defines the lengths that all the nested -/// arrays must have, which makes defining all the lengths as easy as defining the -/// length of the top-level array. -/// -/// length = 3 -/// child_data[0].length == 3 * 3 == 9 -/// child_data[0].child_data[0].length == 3 * 3 * 2 == 18 -/// -/// child_data[0].child_data[0].buffers[1].size() >= -/// (3 * (3 * 2 * sizeof(int32)) == 3 * 24 == 72) -/// -/// Dealing with offsets is a bit involved. Let's say the array described above has -/// the offsets 2, 5, and 7: -/// -/// { -/// type: fixed_size_list, 3>, -/// offset: 2, -/// ... -/// child_data: [ -/// 0: { -/// type: fixed_size_list, -/// offset: 5, -/// ... -/// child_data: [ -/// 0: { -/// type: int32, -/// offset: 7, -/// buffers: [ -/// 0: NULL, -/// 1: [ 1, 1, 1, 1, 1, 1, 1, // 7 values skipped -/// 0,1, 0,1, 0,1, 0,1, 0,1, // 5 [x,x] values skipped -/// -/// 0,0,0,0,0,1, // -/// 0,0,0,0,0,1, // 2 [[x,x], [x,x], [x,x]] values skipped -/// -/// 1, 2, 3, 4, 5, 6, // -/// 0, 0, 0, 0, 0, 0 // the actual values -/// 7, 8, 9, 10, 11, 12 // -/// ] -/// ], -/// } -/// ] -/// } -/// ] -/// } -/// -/// The offset of the innermost values buffer, in bytes, is calculated as: -/// -/// ((2 * 3) + (5 * 2) + 7) * sizeof(int32) = 29 * 4 bytes = 116 bytes -/// -/// In general, the formula to calculate the offset of the innermost values buffer is: -/// -/// ((off_0 * fsl_size_0) + (off_1 * fsl_size_1) + ... + innermost_off) -/// * sizeof(innermost_type) -/// -/// `OffsetPointerOfFixedWidthValues()` can calculate this byte offset and return the -/// pointer to the first relevant byte of the innermost values buffer. -/// /// \param source The array to check /// \param force_null_count If true, GetNullCount() is used instead of null_count -/// \param exclude_dictionary If true, DICTIONARY is excluded from the -/// is_fixed_width() types. Default: false. +/// \param exclude_bool_and_dictionary If true, BOOL and DICTIONARY are excluded from +/// the is_fixed_width() types. Default: false. ARROW_EXPORT bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count = false, - bool exclude_dictionary = false); + bool exclude_bool_and_dictionary = false); + +// Take the following `fixed_size_list, 3>` array as an +// example: +// +// [ +// [[1, 2], [3, 4], [ 5, 6]], +// null, +// [[7, 8], [9, 10], [11, 12]] +// ] +// +// in memory, it would look like: +// +// { +// type: fixed_size_list, 3>, +// length: 3, +// null_count: 1, +// offset: 0, +// buffers: [ +// 0: [0b00000101] +// ], +// child_data: [ +// 0: { +// type: fixed_size_list, +// length: 9, +// null_count: 0, +// offset: 0, +// buffers: [0: NULL], +// child_data: [ +// 0: { +// type: int32, +// length: 18, +// null_count: 0, +// offset: 0, +// buffers: [ +// 0: NULL, +// 1: [ 1, 2, 3, 4, 5, 6, +// 0, 0, 0, 0, 0, 0 +// 7, 8, 9, 10, 11, 12 ] +// ], +// child_data: [] +// } +// ] +// } +// ] +// } +// +// This layout fits the fixed-width like definition because the innermost type +// is byte-aligned fixed-width (int32 = 4 bytes) and the internal arrays don't +// have nulls. The validity bitmap is only needed at the top-level array. +// +// Writing to this array can be done in the same way writing to a flat fixed-width +// array is done, by: +// 1. Updating the validity bitmap at the top-level array if nulls are present. +// 2. Updating a continuous fixed-width block of memory through a single pointer. +// +// The length of this block of memory is the product of the list sizes in the +// `FixedSizeList` types and the byte width of the innermost fixed-width type: +// +// 3 * 2 * 4 = 24 bytes +// +// Writing the `[[1, 2], [3, 4], [5, 6]]` value at a given index can be done by +// simply setting the validity bit to 1 and writing the 24-byte sequence of +// integers `[1, 2, 3, 4, 5, 6]` to the memory block at `byte_ptr + index * 24`. +// +// The length of the top-level array fully defines the lengths that all the nested +// arrays must have, which makes defining all the lengths as easy as defining the +// length of the top-level array. +// +// length = 3 +// child_data[0].length == 3 * 3 == 9 +// child_data[0].child_data[0].length == 3 * 3 * 2 == 18 +// +// child_data[0].child_data[0].buffers[1].size() >= +// (3 * (3 * 2 * sizeof(int32)) == 3 * 24 == 72) +// +// Dealing with offsets is a bit involved. Let's say the array described above has +// the offsets 2, 5, and 7: +// +// { +// type: fixed_size_list, 3>, +// offset: 2, +// ... +// child_data: [ +// 0: { +// type: fixed_size_list, +// offset: 5, +// ... +// child_data: [ +// 0: { +// type: int32, +// offset: 7, +// buffers: [ +// 0: NULL, +// 1: [ 1, 1, 1, 1, 1, 1, 1, // 7 values skipped +// 0,1, 0,1, 0,1, 0,1, 0,1, // 5 [x,x] values skipped +// +// 0,0,0,0,0,1, // +// 0,0,0,0,0,1, // 2 [[x,x], [x,x], [x,x]] values skipped +// +// 1, 2, 3, 4, 5, 6, // +// 0, 0, 0, 0, 0, 0 // the actual values +// 7, 8, 9, 10, 11, 12 // +// ] +// ], +// } +// ] +// } +// ] +// } +// +// The offset of the innermost values buffer, in bytes, is calculated as: +// +// ((2 * 3) + (5 * 2) + 7) * sizeof(int32) = 29 * 4 bytes = 116 bytes +// +// In general, the formula to calculate the offset of the innermost values buffer is: +// +// ((off_0 * fsl_size_0) + (off_1 * fsl_size_1) + ... + innermost_off) +// * sizeof(innermost_type) +// +// `OffsetPointerOfFixedByteWidthValues()` can calculate this byte offset and return +// the pointer to the first relevant byte of the innermost values buffer. /// \brief Checks if the given array has a fixed-width type or if it's an array of /// fixed-size list that can be flattened to an array of fixed-width values. /// -/// This function is a more general version of -/// `IsFixedWidthLike(const ArraySpan&, bool)` that allows the caller to further -/// restrict the inner value types that should be considered fixed-width. -/// /// \param source The array to check /// \param force_null_count If true, GetNullCount() is used instead of null_count /// \param extra_predicate A DataType predicate that can be used to further @@ -217,9 +211,7 @@ inline bool IsFixedWidthLike(const ArraySpan& source, bool force_null_count, values = &values->child_data[0]; continue; } - // BOOL has to be excluded because it's not byte-aligned. - return type->id() != Type::BOOL && is_fixed_width(type->id()) && - extra_predicate(*type); + return is_fixed_width(type->id()) && extra_predicate(*type); } } return false; @@ -251,6 +243,10 @@ ARROW_EXPORT int64_t FixedWidthInBytes(const DataType& type); /// \brief Get the fixed-width in bits of a type if it is a fixed-width like /// type. /// +/// If the array is a FixedSizeList (of any level of nesting), the bit width of +/// the values is the product of all fixed-list sizes and the bit width of the +/// innermost fixed-width value type. +/// /// \return The bit-width of the values or -1 /// \see FixedWidthInBytes ARROW_EXPORT int64_t FixedWidthInBits(const DataType& type); @@ -260,7 +256,7 @@ namespace internal { /// \brief Allocate an ArrayData for a type that is fixed-width like. /// /// This function performs the same checks performed by -/// `IsFixedWidthLike(source, false)`. If `source.type` is not a simple +/// `IsFixedWidthLike(source, false, false)`. If `source.type` is not a simple /// fixed-width type, caller should make sure it passes the /// `IsFixedWidthLike(source)` checks. That guarantees that it's possible to /// allocate an array that can serve as a destination for a kernel that writes values @@ -280,18 +276,24 @@ ARROW_EXPORT Status PreallocateFixedWidthArrayData(::arrow::compute::KernelConte } // namespace internal -/// \brief Get the pointer to the fixed-width values of a fixed-width like array. +/// \brief Get the 0-7 residual offset in bits and the pointer to the fixed-width +/// values of a fixed-width like array. /// -/// This function might return NULLPTR if the type of the array is BOOL or -/// if the pre-conditions listed are not satisfied. The converse is not true -/// (i.e. not getting NULLPTR doesn't guarantee that source is a fixed-width -/// like array). +/// For byte-aligned types, the offset is always 0. /// /// \pre `IsFixedWidthLike(source)` or the more restrictive /// is_fixed_width(*mutable_array->type) SHOULD be true -/// \return The pointer to the fixed-width values of an array or NULLPTR -/// if pre-conditions are not satisfied. -ARROW_EXPORT const uint8_t* OffsetPointerOfFixedWidthValues(const ArraySpan& source); +/// \return A pair with the residual offset in bits (0-7) and the pointer +/// to the fixed-width values. +ARROW_EXPORT std::pair OffsetPointerOfFixedBitWidthValues( + const ArraySpan& source); + +/// \brief Get the pointer to the fixed-width values of a fixed-width like array. +/// +/// \pre `IsFixedWidthLike(source)` should be true and BOOL should be excluded +/// as each bool is 1-bit width making it impossible to produce a +/// byte-aligned pointer to the values in the general case. +ARROW_EXPORT const uint8_t* OffsetPointerOfFixedByteWidthValues(const ArraySpan& source); /// \brief Get the mutable pointer to the fixed-width values of an array /// allocated by PreallocateFixedWidthArrayData. diff --git a/cpp/src/arrow/util/fixed_width_test.cc b/cpp/src/arrow/util/fixed_width_test.cc index 2f05221ed6535..3b35de1b6bbeb 100644 --- a/cpp/src/arrow/util/fixed_width_test.cc +++ b/cpp/src/arrow/util/fixed_width_test.cc @@ -80,10 +80,7 @@ TEST_F(TestFixedWidth, IsFixedWidth) { TEST_F(TestFixedWidth, IsFixedWidthLike) { auto arr = ArraySpan{*fsl_bool_array_->data()}; - // bools wrapped by fixed-size-list are not fixed-width because the - // innermost data buffer is a bitmap and won't byte-align. - ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false)); - ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/true)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); arr = ArraySpan{*fsl_int_array_->data()}; ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false)); @@ -114,12 +111,12 @@ TEST_F(TestFixedWidth, IsFixedWidthLike) { arr = ArraySpan{*dict_string_array_->data()}; // Dictionaries are considered fixed-width by is_fixed_width(), but excluded - // by IsFixedWidthLike if exclude_dictionary=true. + // by IsFixedWidthLike if exclude_bool_and_dictionary=true. ASSERT_TRUE(IsFixedWidthLike(arr)); - ASSERT_TRUE( - IsFixedWidthLike(arr, /*force_null_count=*/false, /*exclude_dictionary=*/false)); - ASSERT_FALSE( - IsFixedWidthLike(arr, /*force_null_count=*/false, /*exclude_dictionary=*/true)); + ASSERT_TRUE(IsFixedWidthLike(arr, /*force_null_count=*/false, + /*exclude_bool_and_dictionary=*/false)); + ASSERT_FALSE(IsFixedWidthLike(arr, /*force_null_count=*/false, + /*exclude_bool_and_dictionary=*/true)); } TEST_F(TestFixedWidth, MeasureWidthInBytes) { @@ -184,9 +181,9 @@ TEST_F(TestFixedWidth, MeasureWidthInBits) { ASSERT_EQ(FixedWidthInBits(*varlen), -1); ASSERT_EQ(FixedWidthInBits(*varlen), -1); - ASSERT_EQ(FixedWidthInBits(*fsl(0, b)), -1); - ASSERT_EQ(FixedWidthInBits(*fsl(3, b)), -1); - ASSERT_EQ(FixedWidthInBits(*fsl(5, b)), -1); + ASSERT_EQ(FixedWidthInBits(*fsl(0, b)), 0); + ASSERT_EQ(FixedWidthInBits(*fsl(3, b)), 3); + ASSERT_EQ(FixedWidthInBits(*fsl(5, b)), 5); ASSERT_EQ(FixedWidthInBits(*fsl(0, i8)), 0); ASSERT_EQ(FixedWidthInBits(*fsl(3, i8)), 3 * 8);