Merge branch 'main' into unity-azure-support
anilmenon14 committed Oct 10, 2024
2 parents eac67c3 + 73ff3f3 commit c4ad0dd
Showing 55 changed files with 1,483 additions and 856 deletions.
5 changes: 5 additions & 0 deletions .github/workflows/python-package.yml
@@ -545,6 +545,10 @@ jobs:
matrix:
python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs
daft-runner: [py, ray]
enable-native-executor: [0, 1]
exclude:
- daft-runner: ray
enable-native-executor: 1
steps:
- uses: actions/checkout@v4
with:
@@ -583,6 +587,7 @@ jobs:
pytest tests/integration/sql -m 'integration or not integration' --durations=50
env:
DAFT_RUNNER: ${{ matrix.daft-runner }}
DAFT_ENABLE_NATIVE_EXECUTOR: ${{ matrix.enable-native-executor }}
- name: Send Slack notification on failure
uses: slackapi/slack-github-action@...
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
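A note on the matrix above: GitHub Actions expands the matrix to the cross product of its axes and then drops any combination matched by an exclude entry, so the native executor is only exercised on the py runner. A quick sketch of the resulting combinations (plain Python, illustration only, not part of the workflow):

    from itertools import product

    # The matrix axes and the exclude rule from the workflow above.
    matrix = {"daft-runner": ["py", "ray"], "enable-native-executor": [0, 1]}
    exclude = [{"daft-runner": "ray", "enable-native-executor": 1}]

    combos = [dict(zip(matrix, values)) for values in product(*matrix.values())]
    combos = [c for c in combos if c not in exclude]
    print(combos)
    # [{'daft-runner': 'py', 'enable-native-executor': 0},
    #  {'daft-runner': 'py', 'enable-native-executor': 1},
    #  {'daft-runner': 'ray', 'enable-native-executor': 0}]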
2 changes: 2 additions & 0 deletions daft/daft/__init__.pyi
@@ -1051,6 +1051,7 @@ class PyExpr:
def approx_count_distinct(self) -> PyExpr: ...
def approx_percentiles(self, percentiles: float | list[float]) -> PyExpr: ...
def mean(self) -> PyExpr: ...
def stddev(self) -> PyExpr: ...
def min(self) -> PyExpr: ...
def max(self) -> PyExpr: ...
def any_value(self, ignore_nulls: bool) -> PyExpr: ...
@@ -1336,6 +1337,7 @@ class PySeries:
def count(self, mode: CountMode) -> PySeries: ...
def sum(self) -> PySeries: ...
def mean(self) -> PySeries: ...
def stddev(self) -> PySeries: ...
def min(self) -> PySeries: ...
def max(self) -> PySeries: ...
def agg_list(self) -> PySeries: ...
55 changes: 55 additions & 0 deletions daft/dataframe/dataframe.py
@@ -2118,6 +2118,33 @@ def mean(self, *cols: ColumnInputType) -> "DataFrame":
"""
return self._apply_agg_fn(Expression.mean, cols)

@DataframePublicAPI
def stddev(self, *cols: ColumnInputType) -> "DataFrame":
"""Performs a global standard deviation on the DataFrame
Example:
>>> import daft
>>> df = daft.from_pydict({"col_a":[0,1,2]})
>>> df = df.stddev("col_a")
>>> df.show()
╭───────────────────╮
│ col_a │
│ --- │
│ Float64 │
╞═══════════════════╡
│ 0.816496580927726 │
╰───────────────────╯
<BLANKLINE>
(Showing first 1 of 1 rows)
Args:
*cols (Union[str, Expression]): columns to compute the standard deviation of
Returns:
DataFrame: Globally aggregated standard deviation. Should be a single row.
"""
return self._apply_agg_fn(Expression.stddev, cols)
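The value in the docstring example is the population standard deviation (ddof=0) of [0, 1, 2]; the sample standard deviation would be 1.0. A quick cross-check with numpy, shown for illustration:

    import numpy as np

    print(np.std([0, 1, 2]))          # 0.816496580927726, matches the docstring output
    print(np.std([0, 1, 2], ddof=1))  # 1.0, the sample standard deviation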

@DataframePublicAPI
def min(self, *cols: ColumnInputType) -> "DataFrame":
"""Performs a global min on the DataFrame
@@ -2856,6 +2883,34 @@ def mean(self, *cols: ColumnInputType) -> "DataFrame":
"""
return self.df._apply_agg_fn(Expression.mean, cols, self.group_by)

def stddev(self, *cols: ColumnInputType) -> "DataFrame":
"""Performs grouped standard deviation on this GroupedDataFrame.
Example:
>>> import daft
>>> df = daft.from_pydict({"keys": ["a", "a", "a", "b"], "col_a": [0,1,2,100]})
>>> df = df.groupby("keys").stddev()
>>> df.show()
╭──────┬───────────────────╮
│ keys ┆ col_a │
│ --- ┆ --- │
│ Utf8 ┆ Float64 │
╞══════╪═══════════════════╡
│ a ┆ 0.816496580927726 │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┤
│ b ┆ 0 │
╰──────┴───────────────────╯
<BLANKLINE>
(Showing first 2 of 2 rows)
Args:
*cols (Union[str, Expression]): columns to compute the standard deviation of
Returns:
DataFrame: DataFrame with grouped standard deviation.
"""
return self.df._apply_agg_fn(Expression.stddev, cols, self.group_by)

def min(self, *cols: ColumnInputType) -> "DataFrame":
"""Perform grouped min on this GroupedDataFrame.
8 changes: 7 additions & 1 deletion daft/delta_lake/delta_lake_scan.py
@@ -23,12 +23,15 @@

if TYPE_CHECKING:
from collections.abc import Iterator
from datetime import datetime

logger = logging.getLogger(__name__)


class DeltaLakeScanOperator(ScanOperator):
def __init__(self, table_uri: str, storage_config: StorageConfig) -> None:
def __init__(
self, table_uri: str, storage_config: StorageConfig, version: int | str | datetime | None = None
) -> None:
super().__init__()

# Unfortunately delta-rs doesn't do very good inference of credentials for S3. Thus the current Daft behavior of passing
@@ -76,6 +79,9 @@ def __init__(self, table_uri: str, storage_config: StorageConfig) -> None:
table_uri, storage_options=io_config_to_storage_options(deltalake_sdk_io_config, table_uri)
)

if version is not None:
self._table.load_as_version(version)

self._storage_config = storage_config
self._schema = Schema.from_pyarrow_schema(self._table.schema().to_pyarrow())
partition_columns = set(self._table.metadata().partition_columns)
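For reference, load_as_version is delta-rs's time-travel entry point; it accepts an int version, an RFC 3339 timestamp string, or a datetime. A minimal standalone sketch, assuming the deltalake package and a hypothetical table path:

    from datetime import datetime, timezone
    from deltalake import DeltaTable

    dt = DeltaTable("path/to/delta_table")  # hypothetical path
    dt.load_as_version(3)  # pin the table to an explicit version number
    dt.load_as_version(datetime(2024, 9, 1, tzinfo=timezone.utc))  # or time-travel to a timestamp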
5 changes: 5 additions & 0 deletions daft/expressions/expressions.py
@@ -862,6 +862,11 @@ def mean(self) -> Expression:
expr = self._expr.mean()
return Expression._from_pyexpr(expr)

def stddev(self) -> Expression:
"""Calculates the standard deviation of the values in the expression"""
expr = self._expr.stddev()
return Expression._from_pyexpr(expr)

def min(self) -> Expression:
"""Calculates the minimum value in the expression"""
expr = self._expr.min()
12 changes: 9 additions & 3 deletions daft/io/_deltalake.py
@@ -11,12 +11,15 @@
from daft.logical.builder import LogicalPlanBuilder

if TYPE_CHECKING:
from datetime import datetime

from daft.unity_catalog import UnityCatalogTable


@PublicAPI
def read_deltalake(
table: Union[str, DataCatalogTable, "UnityCatalogTable"],
version: Optional[Union[int, str, "datetime"]] = None,
io_config: Optional["IOConfig"] = None,
_multithreaded_io: Optional[bool] = None,
) -> DataFrame:
@@ -37,8 +40,11 @@ def read_deltalake(
Args:
table: Either a URI for the Delta Lake table or a :class:`~daft.io.catalog.DataCatalogTable` instance
referencing a table in a data catalog, such as AWS Glue Data Catalog or Databricks Unity Catalog.
io_config: A custom :class:`~daft.daft.IOConfig` to use when accessing Delta Lake object storage data. Defaults to None.
_multithreaded_io: Whether to use multithreading for IO threads. Setting this to False can be helpful in reducing
version (optional): If an int is passed, read the table at that version number. If a string or datetime is passed,
read the table as of that timestamp. Strings must be in RFC 3339 / ISO 8601 date-time format.
Datetimes without an explicit timezone are assumed to be UTC. By default, reads the latest version of the table.
io_config (optional): A custom :class:`~daft.daft.IOConfig` to use when accessing Delta Lake object storage data. Defaults to None.
_multithreaded_io (optional): Whether to use multithreading for IO threads. Setting this to False can be helpful in reducing
the amount of system resources (number of connections and thread contention) when running in the Ray runner.
Defaults to None, which will let Daft decide based on the runner it is currently using.
@@ -69,7 +75,7 @@
raise ValueError(
f"table argument must be a table URI string, DataCatalogTable or UnityCatalogTable instance, but got: {type(table)}, {table}"
)
delta_lake_operator = DeltaLakeScanOperator(table_uri, storage_config=storage_config)
delta_lake_operator = DeltaLakeScanOperator(table_uri, storage_config=storage_config, version=version)

handle = ScanOperatorHandle.from_python_scan_operator(delta_lake_operator)
builder = LogicalPlanBuilder.from_tabular_scan(scan_operator=handle)
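A usage sketch for the new version parameter, using a hypothetical table URI:

    import daft

    # Read the table pinned at version 3.
    df = daft.read_deltalake("s3://bucket/path/to/delta-table", version=3)

    # Or time-travel with an RFC 3339 timestamp; naive times are treated as UTC.
    df = daft.read_deltalake("s3://bucket/path/to/delta-table", version="2024-09-01T00:00:00Z")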
4 changes: 4 additions & 0 deletions daft/series.py
@@ -512,6 +512,10 @@ def mean(self) -> Series:
assert self._series is not None
return Series._from_pyseries(self._series.mean())

def stddev(self) -> Series:
assert self._series is not None
return Series._from_pyseries(self._series.stddev())

def sum(self) -> Series:
assert self._series is not None
return Series._from_pyseries(self._series.sum())
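A usage sketch for the new Series.stddev, assuming Series.from_pylist to construct the input:

    import daft

    s = daft.Series.from_pylist([0, 1, 2])
    print(s.stddev())  # expected: a single-element Float64 Series, ~0.8165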
1 change: 1 addition & 0 deletions docs/source/api_docs/dataframe.rst
@@ -104,6 +104,7 @@ Aggregations
DataFrame.groupby
DataFrame.sum
DataFrame.mean
DataFrame.stddev
DataFrame.count
DataFrame.min
DataFrame.max
1 change: 1 addition & 0 deletions docs/source/api_docs/expressions.rst
@@ -113,6 +113,7 @@ The following can be used with DataFrame.agg or GroupedDataFrame.agg
Expression.count
Expression.sum
Expression.mean
Expression.stddev
Expression.min
Expression.max
Expression.any_value
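Per the note above, Expression.stddev also composes with DataFrame.agg and GroupedDataFrame.agg; a brief sketch:

    import daft
    from daft import col

    df = daft.from_pydict({"keys": ["a", "a", "b"], "vals": [0, 1, 2]})
    result = df.groupby("keys").agg(col("vals").stddev())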
7 changes: 7 additions & 0 deletions src/daft-core/src/array/fixed_size_list_array.rs
@@ -64,6 +64,13 @@ impl FixedSizeListArray {
self.validity.as_ref()
}

pub fn null_count(&self) -> usize {
match self.validity() {
None => 0,
Some(validity) => validity.unset_bits(),
}
}
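(Aside: this is standard Arrow validity-bitmap bookkeeping; the null count is the number of unset bits in the bitmap, or zero when no bitmap exists. The same accounting, illustrated with pyarrow rather than Daft's API:)

    import pyarrow as pa

    # One null value means one unset bit in the validity bitmap.
    arr = pa.array([1, None, 3])
    print(arr.null_count)  # 1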

pub fn concat(arrays: &[&Self]) -> DaftResult<Self> {
if arrays.is_empty() {
return Err(DaftError::ValueError(
136 changes: 57 additions & 79 deletions src/daft-core/src/array/ops/list_agg.rs
@@ -10,52 +10,72 @@ use crate::{
series::IntoSeries,
};

macro_rules! impl_daft_list_agg {
    () => {
        type Output = DaftResult<ListArray>;

        fn list(&self) -> Self::Output {
            let child_series = self.clone().into_series();
            let offsets =
                arrow2::offset::OffsetsBuffer::try_from(vec![0, child_series.len() as i64])?;
            let list_field = self.field.to_list_field()?;
            Ok(ListArray::new(list_field, child_series, offsets, None))
        }

        fn grouped_list(&self, groups: &GroupIndices) -> Self::Output {
            let mut offsets = Vec::with_capacity(groups.len() + 1);

            offsets.push(0);
            for g in groups {
                offsets.push(offsets.last().unwrap() + g.len() as i64);
            }

            let total_capacity = *offsets.last().unwrap();

            let mut growable: Box<dyn Growable> = Box::new(Self::make_growable(
                self.name(),
                self.data_type(),
                vec![self],
                self.null_count() > 0,
                total_capacity as usize,
            ));

            for g in groups {
                for idx in g {
                    growable.extend(0, *idx as usize, 1);
                }
            }
            let list_field = self.field.to_list_field()?;

            Ok(ListArray::new(
                list_field,
                growable.build()?,
                arrow2::offset::OffsetsBuffer::try_from(offsets)?,
                None,
            ))
        }
    };
}

impl<T> DaftListAggable for DataArray<T>
where
    T: DaftArrowBackedType,
    Self: IntoSeries,
    Self: GrowableArray,
{
    impl_daft_list_agg!();
}

impl DaftListAggable for ListArray {
    impl_daft_list_agg!();
}

impl DaftListAggable for FixedSizeListArray {
    impl_daft_list_agg!();
}

impl DaftListAggable for StructArray {
    impl_daft_list_agg!();
}

#[cfg(feature = "python")]
@@ -95,45 +115,3 @@ impl DaftListAggable for crate::datatypes::PythonArray {
Self::new(self.field().clone().into(), Box::new(arrow_array))
}
}

impl DaftListAggable for ListArray {
type Output = DaftResult<Self>;

fn list(&self) -> Self::Output {
// TODO(FixedSizeList)
todo!("Requires new ListArrays for implementation")
}

fn grouped_list(&self, _groups: &GroupIndices) -> Self::Output {
// TODO(FixedSizeList)
todo!("Requires new ListArrays for implementation")
}
}

impl DaftListAggable for FixedSizeListArray {
type Output = DaftResult<ListArray>;

fn list(&self) -> Self::Output {
// TODO(FixedSizeList)
todo!("Requires new ListArrays for implementation")
}

fn grouped_list(&self, _groups: &GroupIndices) -> Self::Output {
// TODO(FixedSizeList)
todo!("Requires new ListArrays for implementation")
}
}

impl DaftListAggable for StructArray {
type Output = DaftResult<ListArray>;

fn list(&self) -> Self::Output {
// TODO(FixedSizeList)
todo!("Requires new ListArrays for implementation")
}

fn grouped_list(&self, _groups: &GroupIndices) -> Self::Output {
// TODO(FixedSizeList)
todo!("Requires new ListArrays for implementation")
}
}