diff --git a/Cargo.lock b/Cargo.lock
index d8e8a5907f..9652e6da9f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -281,13 +281,11 @@ dependencies = [
  "serde_derive",
  "serde_json",
  "simdutf8",
- "similar",
  "streaming-iterator",
  "strength_reduce",
  "thiserror",
  "tokio",
  "tokio-util",
- "tracing",
  "zstd 0.12.4",
 ]
 
@@ -1810,7 +1808,6 @@ dependencies = [
  "log",
  "pyo3",
  "serde",
- "tracing",
  "typetag",
 ]
 
@@ -2217,7 +2214,6 @@ dependencies = [
  "pyo3",
  "rand 0.8.5",
  "serde",
- "tracing",
 ]
 
 [[package]]
@@ -4943,12 +4939,6 @@ version = "0.1.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f27f6278552951f1f2b8cf9da965d10969b2efdea95a6ec47987ab46edfe263a"
 
-[[package]]
-name = "similar"
-version = "2.6.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e"
-
 [[package]]
 name = "simple_asn1"
 version = "0.6.2"
diff --git a/src/arrow2/Cargo.toml b/src/arrow2/Cargo.toml
index b4bccdb1cf..b3862e34f0 100644
--- a/src/arrow2/Cargo.toml
+++ b/src/arrow2/Cargo.toml
@@ -67,12 +67,10 @@ serde_json = {version = "^1.0", features = [
 ], optional = true}
 # For SIMD utf8 validation
 simdutf8 = "0.1.4"
-similar = "2.6.0"
 streaming-iterator = {version = "0.1", optional = true}
 # for division/remainder optimization at runtime
 strength_reduce = {version = "0.2", optional = true}
 thiserror = {workspace = true}
-tracing = "0.1.40"
 zstd = {version = "0.12", optional = true}
 
 # parquet support
diff --git a/src/arrow2/src/array/map/mod.rs b/src/arrow2/src/array/map/mod.rs
index d3cece71b3..9ffddd6fd3 100644
--- a/src/arrow2/src/array/map/mod.rs
+++ b/src/arrow2/src/array/map/mod.rs
@@ -203,7 +203,6 @@ impl Array for MapArray {
     impl_common_array!();
 
     fn convert_logical_type(&self, target: DataType) -> Box<dyn Array> {
-        tracing::trace!("converting logical type to\n{target:#?}");
         let outer_is_map = matches!(target, DataType::Map { .. });
 
         if outer_is_map {
@@ -232,7 +231,7 @@ impl Array for MapArray {
             field.change_type(target_inner.data_type.clone());
 
             let offsets = self.offsets().clone();
-            let offsets = offsets.map(|offset| offset as i64);
+            let offsets = unsafe { offsets.map_unchecked(|offset| offset as i64) };
 
             let list = ListArray::new(target, offsets, field, self.validity.clone());
 
diff --git a/src/arrow2/src/array/mod.rs b/src/arrow2/src/array/mod.rs
index b84c015d09..deecf8bbbf 100644
--- a/src/arrow2/src/array/mod.rs
+++ b/src/arrow2/src/array/mod.rs
@@ -54,6 +54,17 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static {
     /// When the validity is [`None`], all slots are valid.
     fn validity(&self) -> Option<&Bitmap>;
 
+    /// Returns an iterator over the direct children of this Array.
+    ///
+    /// This method is useful for accessing child Arrays in composite types such as struct arrays.
+    /// By default, it returns an empty iterator, as most array types do not have child arrays.
+    ///
+    /// # Returns
+    /// A boxed iterator yielding mutable references to child Arrays.
+    ///
+    /// # Examples
+    /// For a StructArray, this would return an iterator over its field arrays.
+    /// For most other array types, this returns an empty iterator.
     fn direct_children<'a>(&'a mut self) -> Box<dyn Iterator<Item = &'a mut dyn Array> + 'a> {
         Box::new(core::iter::empty())
     }
@@ -636,16 +647,7 @@ macro_rules! impl_common_array {
 
     fn change_type(&mut self, data_type: DataType) {
         if data_type.to_physical_type() != self.data_type().to_physical_type() {
-            let from = format!("{:#?}", self.data_type());
-            let to = format!("{:#?}", data_type);
-
-
-            let diff = similar::TextDiff::from_lines(&from, &to);
-
-            let diff = diff
-                .unified_diff();
-
-            panic!("{diff}");
+            panic!("Cannot change array type from {:?} to {:?}", self.data_type(), data_type);
         }
 
         self.data_type = data_type.clone();
diff --git a/src/arrow2/src/datatypes/field.rs b/src/arrow2/src/datatypes/field.rs
index 8239d7633a..59eb894a3e 100644
--- a/src/arrow2/src/datatypes/field.rs
+++ b/src/arrow2/src/datatypes/field.rs
@@ -25,8 +25,6 @@ pub struct Field {
 impl Field {
     /// Creates a new [`Field`].
     pub fn new<T: Into<String>>(name: T, data_type: DataType, is_nullable: bool) -> Self {
-        let span = tracing::trace_span!("ArrowField::new", data_type = ?data_type, is_nullable);
-        let _guard = span.enter();
         Field {
             name: name.into(),
             data_type,
diff --git a/src/arrow2/src/datatypes/mod.rs b/src/arrow2/src/datatypes/mod.rs
index d08c8fcf36..b5c5b1a8b5 100644
--- a/src/arrow2/src/datatypes/mod.rs
+++ b/src/arrow2/src/datatypes/mod.rs
@@ -5,13 +5,11 @@ mod field;
 mod physical_type;
 mod schema;
 
+use std::{collections::BTreeMap, sync::Arc};
+
 pub use field::Field;
 pub use physical_type::*;
 pub use schema::Schema;
-
-use std::collections::BTreeMap;
-use std::sync::Arc;
-
 use serde::{Deserialize, Serialize};
 
 /// typedef for [BTreeMap<String, String>] denoting [`Field`]'s and [`Schema`]'s metadata.
@@ -19,10 +17,12 @@ pub type Metadata = BTreeMap<String, String>;
 /// typedef for [Option<(String, Option<String>)>] descr
 pub(crate) type Extension = Option<(String, Option<String>)>;
 
+#[allow(unused_imports, reason = "used in documentation")]
+use crate::array::Array;
+
 pub type ArrowDataType = DataType;
 pub type ArrowField = Field;
 
-
 /// The set of supported logical types in this crate.
 ///
 /// Each variant uniquely identifies a logical type, which define specific semantics to the data
@@ -168,11 +168,46 @@ impl DataType {
         Self::Map(field.into(), keys_sorted)
     }
 
-    pub fn direct_children(&self, mut processor: impl FnMut(&DataType)) {
+    /// Processes the direct children data types of this DataType.
+    ///
+    /// This method is useful for traversing the structure of complex data types.
+    /// It calls the provided closure for each immediate child data type.
+    ///
+    /// This can be used in conjunction with the [`Array::direct_children`] method
+    /// to process both the data types and the corresponding array data.
+    ///
+    /// # Arguments
+    ///
+    /// * `processor` - A closure that takes a reference to a DataType as its argument.
+    ///
+    /// # Examples
+    ///
+    /// ```
+    /// use arrow2::datatypes::{DataType, Field};
+    ///
+    /// let struct_type = DataType::Struct(vec![
+    ///     Field::new("a", DataType::Int32, true),
+    ///     Field::new("b", DataType::Utf8, false),
+    /// ]);
+    ///
+    /// let mut child_types = Vec::new();
+    /// struct_type.direct_children(|child_type| {
+    ///     child_types.push(child_type);
+    /// });
+    ///
+    /// assert_eq!(child_types, vec![&DataType::Int32, &DataType::Utf8]);
+    /// ```
+    pub fn direct_children<'a>(&'a self, mut processor: impl FnMut(&'a DataType)) {
         match self {
-            DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) | DataType::Map(field, ..) => processor(&field.data_type),
-            DataType::Struct(fields) | DataType::Union(fields, _, _) => fields.iter().for_each(|field| processor(&field.data_type)),
-            _ => {} // todo: might want to add more types here
+            DataType::List(field)
+            | DataType::FixedSizeList(field, _)
+            | DataType::LargeList(field)
+            | DataType::Map(field, ..) => processor(&field.data_type),
+            DataType::Struct(fields) | DataType::Union(fields, _, _) => {
+                fields.iter().for_each(|field| processor(&field.data_type))
+            }
+            DataType::Dictionary(_, value_type, _) => processor(value_type),
+            _ => {} // Other types don't have child data types
         }
     }
 }
diff --git a/src/arrow2/src/error.rs b/src/arrow2/src/error.rs
index 3b7eaadf3e..3df1e19381 100644
--- a/src/arrow2/src/error.rs
+++ b/src/arrow2/src/error.rs
@@ -55,6 +55,7 @@ impl Error {
         Self::OutOfSpec(msg.into())
     }
 
+    #[allow(unused)]
    pub(crate) fn nyi<A: Into<String>>(msg: A) -> Self {
        Self::NotYetImplemented(msg.into())
    }
diff --git a/src/arrow2/src/offset.rs b/src/arrow2/src/offset.rs
index bbebf585b3..3d7a2aa869 100644
--- a/src/arrow2/src/offset.rs
+++ b/src/arrow2/src/offset.rs
@@ -1,9 +1,8 @@
 //! Contains the declaration of [`Offset`]
 use std::hint::unreachable_unchecked;
 
-use crate::buffer::Buffer;
-use crate::error::Error;
 pub use crate::types::Offset;
+use crate::{buffer::Buffer, error::Error};
 
 /// A wrapper type of [`Vec<O>`] representing the invariants of Arrow's offsets.
 /// It is guaranteed to (sound to assume that):
@@ -71,7 +70,7 @@ impl<O: Offset> Offsets<O> {
 
     /// Creates a new [`Offsets`] from an iterator of lengths
     #[inline]
-    pub fn try_from_iter<I: IntoIterator<Item = usize>>(iter: I) -> Result<Self, Error> {
+    pub fn try_from_iter<I: IntoIterator<Item = usize>>(iter: I) -> Result<Self, Error> {
         let iterator = iter.into_iter();
         let (lower, _) = iterator.size_hint();
         let mut offsets = Self::with_capacity(lower);
@@ -144,7 +143,9 @@ impl<O: Offset> Offsets<O> {
     /// Returns the last offset of this container.
     #[inline]
     pub fn last(&self) -> &O {
-        self.0.last().unwrap_or_else(|| unsafe { unreachable_unchecked() })
+        self.0
+            .last()
+            .unwrap_or_else(|| unsafe { unreachable_unchecked() })
     }
 
     /// Returns a range (start, end) corresponding to the position `index`
@@ -212,7 +213,7 @@ impl<O: Offset> Offsets<O> {
     /// # Errors
     /// This function errors iff this operation overflows for the maximum value of `O`.
     #[inline]
-    pub fn try_from_lengths<I: Iterator<Item = usize>>(lengths: I) -> Result<Self, Error> {
+    pub fn try_from_lengths<I: Iterator<Item = usize>>(lengths: I) -> Result<Self, Error> {
         let mut self_ = Self::with_capacity(lengths.size_hint().0);
         self_.try_extend_from_lengths(lengths)?;
         Ok(self_)
     }
@@ -222,7 +223,7 @@ impl<O: Offset> Offsets<O> {
     /// # Errors
     /// This function errors iff this operation overflows for the maximum value of `O`.
     #[inline]
-    pub fn try_extend_from_lengths<I: Iterator<Item = usize>>(
+    pub fn try_extend_from_lengths<I: Iterator<Item = usize>>(
         &mut self,
         lengths: I,
     ) -> Result<(), Error> {
@@ -344,12 +345,34 @@ impl<O: Offset> Default for OffsetsBuffer<O> {
     }
 }
 
-impl<O: Offset> OffsetsBuffer<O> {
-    pub fn map<T: Offset>(&self, f: impl Fn(O) -> T) -> OffsetsBuffer<T> {
-        let buffer = self.0.iter()
-            .copied()
-            .map(f)
-            .collect();
+impl<O: Offset> OffsetsBuffer<O> {
+
+    /// Maps each offset to a new value, creating a new [`Self`].
+    ///
+    /// # Safety
+    ///
+    /// This function is marked as `unsafe` because it does not check whether the resulting offsets
+    /// maintain the invariants required by [`OffsetsBuffer`]. The caller must ensure that:
+    ///
+    /// - The resulting offsets are monotonically increasing.
+    /// - The first offset is zero.
+    /// - All offsets are non-negative.
+    ///
+    /// Violating these invariants can lead to undefined behavior when using the resulting [`OffsetsBuffer`].
+    ///
+    /// # Example
+    ///
+    /// ```
+    /// # use arrow2::offset::OffsetsBuffer;
+    /// # let offsets = unsafe { OffsetsBuffer::new_unchecked(vec![0, 2, 5, 7].into()) };
+    /// let doubled = unsafe { offsets.map_unchecked(|x| x * 2) };
+    /// assert_eq!(doubled.buffer().as_slice(), &[0, 4, 10, 14]);
+    /// ```
+    ///
+    /// Note that in this example, doubling the offsets maintains the required invariants,
+    /// but this may not be true for all transformations.
+    pub unsafe fn map_unchecked<T: Offset>(&self, f: impl Fn(O) -> T) -> OffsetsBuffer<T> {
+        let buffer = self.0.iter().copied().map(f).collect();
 
         OffsetsBuffer(buffer)
     }
@@ -363,7 +386,6 @@ impl<O: Offset> OffsetsBuffer<O> {
         Self(offsets)
     }
 
-
     /// Returns an empty [`OffsetsBuffer`] (i.e. with a single element, the zero)
     #[inline]
     pub fn new() -> Self {
@@ -410,7 +432,7 @@ impl<O: Offset> OffsetsBuffer<O> {
         *self.last() - *self.first()
     }
 
-    pub fn ranges(&self) -> impl Iterator<Item = Range<O>> + '_ {
+    pub fn ranges(&self) -> impl Iterator<Item = Range<O>> + '_ {
         self.0.windows(2).map(|w| {
             let from = w[0];
             let to = w[1];
@@ -419,17 +441,20 @@ impl<O: Offset> OffsetsBuffer<O> {
         })
     }
 
-
     /// Returns the first offset.
     #[inline]
     pub fn first(&self) -> &O {
-        self.0.first().unwrap_or_else(|| unsafe { unreachable_unchecked() })
+        self.0
+            .first()
+            .unwrap_or_else(|| unsafe { unreachable_unchecked() })
     }
 
     /// Returns the last offset.
     #[inline]
     pub fn last(&self) -> &O {
-        self.0.last().unwrap_or_else(|| unsafe { unreachable_unchecked() })
+        self.0
+            .last()
+            .unwrap_or_else(|| unsafe { unreachable_unchecked() })
     }
 
     /// Returns a range (start, end) corresponding to the position `index`
@@ -473,7 +498,7 @@ impl<O: Offset> OffsetsBuffer<O> {
 
     /// Returns an iterator with the lengths of the offsets
     #[inline]
-    pub fn lengths(&self) -> impl Iterator<Item = usize> + '_ {
+    pub fn lengths(&self) -> impl Iterator<Item = usize> + '_ {
         self.0.windows(2).map(|w| (w[1] - w[0]).to_usize())
     }
 
diff --git a/src/daft-core/src/series/from.rs b/src/daft-core/src/series/from.rs
index c4ae727bab..fb30db3a93 100644
--- a/src/daft-core/src/series/from.rs
+++ b/src/daft-core/src/series/from.rs
@@ -97,7 +97,7 @@ mod tests {
     }
 
     #[test]
-    fn test_() -> DaftResult<()> {
+    fn test_map_array_conversion() -> DaftResult<()> {
         use arrow2::array::MapArray;
 
         use super::*;
diff --git a/src/daft-dsl/Cargo.toml b/src/daft-dsl/Cargo.toml
index c15bc6c61f..cc72281e2e 100644
--- a/src/daft-dsl/Cargo.toml
+++ b/src/daft-dsl/Cargo.toml
@@ -13,7 +13,6 @@ itertools = {workspace = true}
 log = {workspace = true}
 pyo3 = {workspace = true, optional = true}
 serde = {workspace = true}
-tracing = "0.1.40"
 typetag = "0.2.16"
 
 [features]
diff --git a/src/daft-dsl/src/functions/map/get.rs b/src/daft-dsl/src/functions/map/get.rs
index c833ebabf4..5465f08562 100644
--- a/src/daft-dsl/src/functions/map/get.rs
+++ b/src/daft-dsl/src/functions/map/get.rs
@@ -31,8 +31,6 @@ impl FunctionEvaluator for GetEvaluator {
 
                 let field = Field::new("value", *value);
 
-                tracing::debug!("Field: {:?}", field);
-
                 Ok(field)
             }
 
diff --git a/src/daft-table/Cargo.toml b/src/daft-table/Cargo.toml
index 4b1287e33b..5682fa41a2 100644
--- a/src/daft-table/Cargo.toml
+++ b/src/daft-table/Cargo.toml
@@ -13,7 +13,6 @@ num-traits = {workspace = true}
 pyo3 = {workspace = true, optional = true}
 rand = {workspace = true}
 serde = {workspace = true}
-tracing = "0.1.40"
 
 [features]
 python = ["dep:pyo3", "common-error/python", "daft-core/python", "daft-dsl/python", "common-arrow-ffi/python", "common-display/python", "daft-image/python"]
diff --git a/src/daft-table/src/lib.rs b/src/daft-table/src/lib.rs
index ca1a16bcc2..6f87fd6d49 100644
--- a/src/daft-table/src/lib.rs
+++ b/src/daft-table/src/lib.rs
@@ -497,9 +497,6 @@ impl Table {
     fn eval_expression(&self, expr: &Expr) -> DaftResult<Series> {
         use crate::Expr::*;
 
-        let span = tracing::trace_span!("DataFrame::eval_expression", expr = ?expr);
-        let _guard = span.enter();
-
         let expected_field = expr.to_field(self.schema.as_ref())?;
         let series = match expr {
             Alias(child, name) => Ok(self.eval_expression(child)?.rename(name)),
@@ -578,8 +575,6 @@ impl Table {
             },
         }?;
 
-        tracing::trace!("Series of {expr:?} -> {series:#?}");
-
         if expected_field.name != series.field().name {
             return Err(DaftError::ComputeError(format!(
                 "Mismatch of expected expression name and name from computed series ({} vs {}) for expression: {expr}",
diff --git a/tests/io/test_parquet_roundtrip.py b/tests/io/test_parquet_roundtrip.py
index 99ea54b64c..292c5b98e1 100644
--- a/tests/io/test_parquet_roundtrip.py
+++ b/tests/io/test_parquet_roundtrip.py
@@ -132,14 +132,6 @@ def test_roundtrip_tensor_types(tmp_path):
     # Read the Parquet file back into a new DataFrame
     df_roundtrip = daft.read_parquet(str(tmp_path))
 
-    # Print the schema of the original DataFrame
-    print("Original DataFrame Schema:")
-    print(df_original.schema())
-
-    # Print the schema of the DataFrame after roundtrip
-    print("\nRoundtrip DataFrame Schema:")
-    print(df_roundtrip.schema())
-
     # Verify that the data type is preserved after the roundtrip
     assert df_roundtrip.schema()["tensor_col"].dtype == expected_tensor_dtype
 
diff --git a/tests/table/map/test_map_get.py b/tests/table/map/test_map_get.py
index e556a158bb..053d8fed29 100644
--- a/tests/table/map/test_map_get.py
+++ b/tests/table/map/test_map_get.py
@@ -49,21 +49,7 @@ def test_map_get_logical_type():
     )
     table = MicroPartition.from_arrow(pa.table({"map_col": data}))
 
-    print("table", table)
-
-    print("Key type:", data.type.key_type)
-    print("Item type:", data.type.item_type)
-
-    # Get the first element of the table
-    first_elem = table.slice(0, 1)
-    print("First element of table:", first_elem)
-
-    # Get the 'map_col' from the first element
-    first_map = first_elem.get_column("map_col")
-    print("First map:", type(first_map))
-
     map = col("map_col").map
-
     result = table.eval_expression_list([map.get("foo")])
 
     assert result.to_pydict() == {"value": [datetime.date(2022, 1, 1), datetime.date(2022, 1, 2), None]}