[FEAT] read_sql #1943

Merged Mar 13, 2024 (30 commits)

Commits
dbad9a3  init (colin-ho, Feb 22, 2024)
1e86d0f  int tests (colin-ho, Feb 23, 2024)
7d05a83  sql alchemy version (colin-ho, Feb 23, 2024)
dd9ebbf  fix test (colin-ho, Feb 23, 2024)
f501dd1  retry (colin-ho, Feb 23, 2024)
fa085de  retry all (colin-ho, Feb 23, 2024)
71af666  add try block (colin-ho, Feb 23, 2024)
c836819  move retries out of fixture (colin-ho, Feb 24, 2024)
f24fa9d  move retries out of fixture (colin-ho, Feb 24, 2024)
1237098  add text to query (colin-ho, Feb 24, 2024)
f05ce21  add text to query (colin-ho, Feb 24, 2024)
2cb90fa  fix assertion (colin-ho, Feb 24, 2024)
f1836f1  yay micropartitions always 1 (colin-ho, Feb 24, 2024)
dfc0fe9  add more integration tests + refactor (colin-ho, Feb 26, 2024)
2d51864  cleanup (colin-ho, Feb 26, 2024)
f53bbb2  everything except limit 0 (colin-ho, Feb 28, 2024)
eab61b0  fix math (colin-ho, Feb 29, 2024)
7e86e43  to_sql_inner (colin-ho, Mar 1, 2024)
2f48c55  rename to apply_limit_before_offset (colin-ho, Mar 1, 2024)
ff389d3  docs (colin-ho, Mar 1, 2024)
ccf1c4b  improve literal to_sql, use more equality tests, and add todos (colin-ho, Mar 4, 2024)
ff41a78  fix stuff from merge conflict (colin-ho, Mar 4, 2024)
b077bc5  disable pushdowns in sql reader (colin-ho, Mar 4, 2024)
3c49a8a  disable pushdowns in sql reader (colin-ho, Mar 4, 2024)
f7ec4c9  revise partioning algo (colin-ho, Mar 6, 2024)
2ca97fd  refactor (colin-ho, Mar 6, 2024)
7ac4131  refactor some string args (colin-ho, Mar 6, 2024)
5be0cd2  add datetime support (colin-ho, Mar 6, 2024)
0a85439  comment about timestamp (colin-ho, Mar 6, 2024)
bd65b05  refactor and add limit pushdown (colin-ho, Mar 13, 2024)
69 changes: 69 additions & 0 deletions .github/workflows/python-package.yml
@@ -491,6 +491,75 @@ jobs:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK

integration-test-sql:
runs-on: ubuntu-latest
timeout-minutes: 30
needs:
- integration-test-build
env:
package-name: getdaft
strategy:
fail-fast: false
matrix:
python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs
daft-runner: [py, ray]
steps:
- uses: actions/checkout@v4
with:
submodules: true
fetch-depth: 0
- uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
architecture: x64
cache: pip
cache-dependency-path: |
pyproject.toml
requirements-dev.txt
- name: Download built wheels
uses: actions/download-artifact@v3
with:
name: wheels
path: dist
- name: Setup Virtual Env
run: |
python -m venv venv
echo "$GITHUB_WORKSPACE/venv/bin" >> $GITHUB_PATH
- name: Install Daft and dev dependencies
run: |
pip install --upgrade pip
pip install -r requirements-dev.txt dist/${{ env.package-name }}-*x86_64*.whl --force-reinstall
rm -rf daft
- name: Spin up services
run: |
pushd ./tests/integration/sql/docker-compose/
docker-compose -f ./docker-compose.yml up -d
popd
- name: Run sql integration tests
run: |
pytest tests/integration/sql -m 'integration' --durations=50
env:
DAFT_RUNNER: ${{ matrix.daft-runner }}
- name: Send Slack notification on failure
uses: slackapi/[email protected]
if: ${{ failure() && (github.ref == 'refs/heads/main') }}
with:
payload: |
{
"blocks": [
{
"type": "section",
"text": {
"type": "mrkdwn",
"text": ":rotating_light: [CI] SQL Integration Tests <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|workflow> *FAILED on main* :rotating_light:"
}
}
]
}
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
SLACK_WEBHOOK_TYPE: INCOMING_WEBHOOK

rust-tests:
runs-on: ${{ matrix.os }}-latest
timeout-minutes: 30
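For reference, a minimal local sketch of the kind of check this job runs, assuming a throwaway SQLite database instead of the docker-compose services the workflow provisions; the table name, data, and URL below are illustrative and not taken from the repository's test suite.

import tempfile
import daft
import sqlalchemy

def smoke_test_read_sql() -> None:
    # Throwaway on-disk SQLite database; the real CI job instead talks to
    # services brought up by docker-compose.
    tmp_dir = tempfile.mkdtemp()
    url = f"sqlite:///{tmp_dir}/example.db"
    engine = sqlalchemy.create_engine(url)
    with engine.connect() as conn:
        conn.execute(sqlalchemy.text("CREATE TABLE items (id INTEGER, name TEXT)"))
        conn.execute(sqlalchemy.text("INSERT INTO items VALUES (1, 'a'), (2, 'b')"))
        conn.commit()

    df = daft.read_sql("SELECT * FROM items", url)
    assert df.to_pydict() == {"id": [1, 2], "name": ["a", "b"]}

smoke_test_read_sql()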
2 changes: 2 additions & 0 deletions daft/__init__.py
@@ -76,6 +76,7 @@ def get_build_type() -> str:
read_iceberg,
read_json,
read_parquet,
read_sql,
)
from daft.series import Series
from daft.udf import udf
@@ -94,6 +95,7 @@ def get_build_type() -> str:
"read_parquet",
"read_iceberg",
"read_delta_lake",
"read_sql",
"DataCatalogType",
"DataCatalogTable",
"DataFrame",
3 changes: 3 additions & 0 deletions daft/context.py
@@ -254,6 +254,7 @@ def set_execution_config(
csv_target_filesize: int | None = None,
csv_inflation_factor: float | None = None,
shuffle_aggregation_default_partitions: int | None = None,
read_sql_partition_size_bytes: int | None = None,
) -> DaftContext:
"""Globally sets various configuration parameters which control various aspects of Daft execution. These configuration values
are used when a Dataframe is executed (e.g. calls to `.write_*`, `.collect()` or `.show()`)
@@ -283,6 +284,7 @@
csv_target_filesize: Target File Size when writing out CSV Files. Defaults to 512MB
csv_inflation_factor: Inflation Factor of CSV files (In-Memory-Size / File-Size) ratio. Defaults to 0.5
shuffle_aggregation_default_partitions: Minimum number of partitions to create when performing aggregations. Defaults to 200, unless the number of input partitions is less than 200.
read_sql_partition_size_bytes: Target size of partition when reading from SQL databases. Defaults to 512MB
"""
# Replace values in the DaftExecutionConfig with user-specified overrides
ctx = get_context()
@@ -302,6 +304,7 @@
csv_target_filesize=csv_target_filesize,
csv_inflation_factor=csv_inflation_factor,
shuffle_aggregation_default_partitions=shuffle_aggregation_default_partitions,
read_sql_partition_size_bytes=read_sql_partition_size_bytes,
)

ctx._daft_execution_config = new_daft_execution_config
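As a usage sketch, the new knob is set like any other execution-config value; the 256 MB figure below is an arbitrary illustration, not a recommendation.

from daft.context import set_execution_config

# Lower the target partition size for SQL reads from the 512MB default to
# 256MB; the value is in bytes. Purely illustrative.
set_execution_config(read_sql_partition_size_bytes=256 * 1024 * 1024)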
37 changes: 36 additions & 1 deletion daft/daft.pyi
@@ -217,12 +217,21 @@ class JsonSourceConfig:
chunk_size: int | None = None,
): ...

class DatabaseSourceConfig:
"""
Configuration of a database data source.
"""

sql: str

def __init__(self, sql: str): ...

class FileFormatConfig:
"""
Configuration for parsing a particular file format (Parquet, CSV, JSON).
"""

config: ParquetSourceConfig | CsvSourceConfig | JsonSourceConfig
config: ParquetSourceConfig | CsvSourceConfig | JsonSourceConfig | DatabaseSourceConfig

@staticmethod
def from_parquet_config(config: ParquetSourceConfig) -> FileFormatConfig:
@@ -242,6 +251,12 @@ class FileFormatConfig:
Create a JSON file format config.
"""
...
@staticmethod
def from_database_config(config: DatabaseSourceConfig) -> FileFormatConfig:
"""
Create a database file format config.
"""
...
def file_format(self) -> FileFormat:
"""
Get the file format for this config.
@@ -583,6 +598,20 @@ class ScanTask:
Create a Catalog Scan Task
"""
...
@staticmethod
def sql_scan_task(
url: str,
file_format: FileFormatConfig,
schema: PySchema,
num_rows: int | None,
storage_config: StorageConfig,
size_bytes: int | None,
pushdowns: Pushdowns | None,
) -> ScanTask:
"""
Create a SQL Scan Task
"""
...

class ScanOperatorHandle:
"""
@@ -800,6 +829,7 @@ class PyDataType:
@staticmethod
def python() -> PyDataType: ...
def to_arrow(self, cast_tensor_type_for_ray: builtins.bool | None = None) -> pyarrow.DataType: ...
def is_numeric(self) -> builtins.bool: ...
def is_image(self) -> builtins.bool: ...
def is_fixed_shape_image(self) -> builtins.bool: ...
def is_tensor(self) -> builtins.bool: ...
@@ -826,6 +856,7 @@ class PySchema:
def names(self) -> list[str]: ...
def union(self, other: PySchema) -> PySchema: ...
def eq(self, other: PySchema) -> bool: ...
def estimate_row_size_bytes(self) -> float: ...
@staticmethod
def from_field_name_and_types(names_and_types: list[tuple[str, PyDataType]]) -> PySchema: ...
@staticmethod
@@ -875,6 +906,7 @@ class PyExpr:
def is_in(self, other: PyExpr) -> PyExpr: ...
def name(self) -> str: ...
def to_field(self, schema: PySchema) -> PyField: ...
def to_sql(self) -> str | None: ...
def __repr__(self) -> str: ...
def __hash__(self) -> int: ...
def __reduce__(self) -> tuple: ...
@@ -1218,6 +1250,7 @@ class PyDaftExecutionConfig:
csv_target_filesize: int | None = None,
csv_inflation_factor: float | None = None,
shuffle_aggregation_default_partitions: int | None = None,
read_sql_partition_size_bytes: int | None = None,
) -> PyDaftExecutionConfig: ...
@property
def scan_tasks_min_size_bytes(self) -> int: ...
@@ -1243,6 +1276,8 @@
def csv_inflation_factor(self) -> float: ...
@property
def shuffle_aggregation_default_partitions(self) -> int: ...
@property
def read_sql_partition_size_bytes(self) -> int: ...

class PyDaftPlanningConfig:
def with_config_values(
5 changes: 5 additions & 0 deletions daft/datatype.py
@@ -375,6 +375,8 @@
return cls.decimal128(arrow_type.precision, arrow_type.scale)
elif pa.types.is_date32(arrow_type):
return cls.date()
elif pa.types.is_date64(arrow_type):
return cls.timestamp(TimeUnit.ms())
elif pa.types.is_time64(arrow_type):
timeunit = TimeUnit.from_str(pa.type_for_alias(str(arrow_type)).unit)
return cls.time(timeunit)
@@ -479,6 +481,9 @@
def _is_fixed_shape_image_type(self) -> builtins.bool:
return self._dtype.is_fixed_shape_image()

def _is_numeric_type(self) -> builtins.bool:
return self._dtype.is_numeric()

Codecov: added line daft/datatype.py#L485 was not covered by tests.

def _is_map(self) -> builtins.bool:
return self._dtype.is_map()

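A small illustration of why date64 maps to a millisecond timestamp rather than a plain date: Arrow's date64 type stores milliseconds since the UNIX epoch. The snippet uses only pyarrow and is independent of Daft.

import datetime
import pyarrow as pa

# Arrow date64 values are 64-bit millisecond counts since the UNIX epoch,
# so a millisecond-unit timestamp is the closest lossless Daft type.
arr = pa.array([datetime.date(2024, 3, 13)], type=pa.date64())
print(arr.type)                      # date64[ms]
print(arr.cast(pa.timestamp("ms")))  # midnight on 2024-03-13, millisecond unit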
3 changes: 3 additions & 0 deletions daft/expressions/expressions.py
@@ -473,6 +473,9 @@ def name(self) -> builtins.str:
def __repr__(self) -> builtins.str:
return repr(self._expr)

def _to_sql(self) -> builtins.str | None:
return self._expr.to_sql()

def _to_field(self, schema: Schema) -> Field:
return Field._from_pyfield(self._expr.to_field(schema._schema))

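The _to_sql hook is what lets the SQL scan render supported expressions as SQL fragments, falling back to None when an expression has no SQL equivalent. Below is a toy, standalone sketch of that idea; none of it is Daft's actual translator, and the operator table is invented purely for illustration.

from typing import Optional

# Toy translator: renders a (column, op, literal) predicate as a SQL fragment,
# returning None for anything it cannot express -- mirroring the str | None
# contract of Expression._to_sql, but not its implementation.
SUPPORTED_OPS = {"==": "=", "!=": "<>", "<": "<", "<=": "<=", ">": ">", ">=": ">="}

def predicate_to_sql(column: str, op: str, value) -> Optional[str]:
    sql_op = SUPPORTED_OPS.get(op)
    if sql_op is None:
        return None  # not translatable; the caller must filter in memory instead
    literal = f"'{value}'" if isinstance(value, str) else str(value)
    return f"{column} {sql_op} {literal}"

# A translated predicate can then be appended to the user's query, e.g.:
#   SELECT * FROM (<user query>) AS subquery WHERE <fragment>
print(predicate_to_sql("price", ">=", 100))  # price >= 100
print(predicate_to_sql("name", "~", "foo"))  # None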
2 changes: 2 additions & 0 deletions daft/io/__init__.py
@@ -14,6 +14,7 @@
from daft.io._iceberg import read_iceberg
from daft.io._json import read_json
from daft.io._parquet import read_parquet
from daft.io._sql import read_sql
from daft.io.catalog import DataCatalogTable, DataCatalogType
from daft.io.file_path import from_glob_path

@@ -39,6 +40,7 @@ def _set_linux_cert_paths():
"read_parquet",
"read_iceberg",
"read_delta_lake",
"read_sql",
"IOConfig",
"S3Config",
"AzureConfig",
55 changes: 55 additions & 0 deletions daft/io/_sql.py
@@ -0,0 +1,55 @@
# isort: dont-add-import: from __future__ import annotations


from typing import Optional

from daft import context
from daft.api_annotations import PublicAPI
from daft.daft import PythonStorageConfig, ScanOperatorHandle, StorageConfig
from daft.dataframe import DataFrame
from daft.logical.builder import LogicalPlanBuilder
from daft.sql.sql_scan import SQLScanOperator


@PublicAPI
def read_sql(
sql: str, url: str, partition_col: Optional[str] = None, num_partitions: Optional[int] = None
) -> DataFrame:
"""Creates a DataFrame from a SQL query.

Example:
>>> df = daft.read_sql("SELECT * FROM my_table", "sqlite:///my_database.db")

.. NOTE::
If partition_col is specified, this function will partition the query by the specified column. You may specify the number of partitions, or let Daft determine the number of partitions.
Daft will first calculate percentiles of the specified column. For example if num_partitions is 3, Daft will calculate the 33rd and 66th percentiles of the specified column, and use these values to partition the query.
If the database does not support the necessary SQL syntax to calculate percentiles, Daft will calculate the min and max of the specified column and partition the query into equal ranges.

Args:
sql (str): SQL query to execute
url (str): URL to the database
partition_col (Optional[str]): Column to partition the data by, defaults to None
num_partitions (Optional[int]): Number of partitions to read the data into,
    defaults to None, which lets Daft determine the number of partitions.

Returns:
DataFrame: Dataframe containing the results of the query
"""

if num_partitions is not None and partition_col is None:
raise ValueError("Failed to execute sql: partition_col must be specified when num_partitions is specified")

io_config = context.get_context().daft_planning_config.default_io_config
storage_config = StorageConfig.python(PythonStorageConfig(io_config))

sql_operator = SQLScanOperator(
    sql,
    url,
    storage_config,
    partition_col=partition_col,
    num_partitions=num_partitions,
)
handle = ScanOperatorHandle.from_python_scan_operator(sql_operator)
builder = LogicalPlanBuilder.from_tabular_scan(scan_operator=handle)

return DataFrame(builder)

Codecov: added lines daft/io/_sql.py#L39-L40, #L42-L43, #L45, #L52-L53, and #L55 were not covered by tests.
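Putting the new reader together with the partitioning behaviour described in the docstring above, a hedged usage sketch: the connection URL, table, and column names are placeholders, and the per-partition SQL in the comment is only a conceptual rendering of the percentile strategy, not the literal queries Daft issues.

import daft

# Placeholder connection string and table; `id` is assumed to be a numeric
# column suitable for range partitioning.
df = daft.read_sql(
    "SELECT * FROM transactions",
    "postgresql://user:pass@localhost:5432/mydb",
    partition_col="id",
    num_partitions=3,
)

# Conceptually, Daft computes the 33rd and 66th percentiles of `id` (falling
# back to equal min/max ranges if the database cannot compute percentiles)
# and reads three slices along the lines of:
#   ... WHERE id <  p33
#   ... WHERE id >= p33 AND id < p66
#   ... WHERE id >= p66
df.show()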
3 changes: 3 additions & 0 deletions daft/logical/schema.py
@@ -116,6 +116,9 @@
def column_names(self) -> list[str]:
return list(self._schema.names())

def estimate_row_size_bytes(self) -> float:
return self._schema.estimate_row_size_bytes()

Codecov: added line daft/logical/schema.py#L120 was not covered by tests.

def __iter__(self) -> Iterator[Field]:
col_names = self.column_names()
yield from (self[name] for name in col_names)
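estimate_row_size_bytes plausibly feeds the partitioning math together with read_sql_partition_size_bytes: an estimated total size determines how many scan tasks to create. The back-of-the-envelope form of that calculation is sketched below as an inference from these two additions, not as code from this PR.

import math

# Assumed relationship, for illustration only: partition count grows with the
# estimated total bytes and shrinks with the configured target partition size.
def estimate_num_partitions(approx_num_rows: int, row_size_bytes: float,
                            partition_size_bytes: int = 512 * 1024 * 1024) -> int:
    total_bytes = approx_num_rows * row_size_bytes
    return max(1, math.ceil(total_bytes / partition_size_bytes))

print(estimate_num_partitions(10_000_000, 200.0))  # ~4 partitions at the 512MB target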
11 changes: 11 additions & 0 deletions daft/runners/partitioning.py
@@ -72,6 +72,17 @@ class TableParseParquetOptions:
coerce_int96_timestamp_unit: TimeUnit = TimeUnit.ns()


@dataclass(frozen=True)
class TableReadSQLOptions:
"""Options for parsing SQL tables

Args:
predicate_expression: Expression predicate to apply to the table
"""

predicate_expression: Expression | None = None


@dataclass(frozen=True)
class PartialPartitionMetadata:
num_rows: None | int
Empty file added daft/sql/__init__.py