-
Notifications
You must be signed in to change notification settings - Fork 174
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Makes swordfish a top level runner, `set_runner_native()`. Additionally sets swordfish to be the default runner for development. This PR also contains some bug fixes and test changes, of which I have left comments for. Additionally, this PR refactors swordfish in two ways: 1. Buffers scan tasks based on a `num_parallel_tasks` parameter, that takes into account any pushed down limits. 2. Adds an `is_err` check on the sender in parts of the code where we have a `while receiver.recv.await -> sender.send` pattern, such that it breaks out of the loop if the sender is dropped. This is needed in cases when the consumer is done receiving data, such as in a Limit, or if the user is doing `iter(df)` and breaks out of the iter, which will cause receivers to be dropped. As such, the senders should recognize this and drop as well. --------- Co-authored-by: Colin Ho <[email protected]> Co-authored-by: Colin Ho <[email protected]>
- Loading branch information
1 parent
2b71ffb
commit 6e28b3f
Showing
38 changed files
with
425 additions
and
228 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,20 +25,21 @@ jobs: | |
fail-fast: false | ||
matrix: | ||
python-version: ['3.8', '3.10'] | ||
daft-runner: [py, ray] | ||
daft-runner: [py, ray, native] | ||
pyarrow-version: [7.0.0, 16.0.0] | ||
enable-native-executor: [0, 1] | ||
os: [ubuntu-20.04, windows-latest] | ||
exclude: | ||
- daft-runner: ray | ||
enable-native-executor: 1 | ||
- daft-runner: ray | ||
pyarrow-version: 7.0.0 | ||
os: ubuntu-20.04 | ||
- daft-runner: py | ||
python-version: '3.10' | ||
pyarrow-version: 7.0.0 | ||
os: ubuntu-20.04 | ||
- daft-runner: native | ||
python-version: '3.10' | ||
pyarrow-version: 7.0.0 | ||
os: ubuntu-20.04 | ||
- python-version: '3.8' | ||
pyarrow-version: 16.0.0 | ||
- os: windows-latest | ||
|
@@ -118,7 +119,6 @@ jobs: | |
CARGO_TARGET_DIR: ./target | ||
|
||
DAFT_RUNNER: ${{ matrix.daft-runner }} | ||
DAFT_ENABLE_NATIVE_EXECUTOR: ${{ matrix.enable-native-executor }} | ||
|
||
- name: Build library and Test with pytest (Windows) | ||
if: ${{ (runner.os == 'Windows') }} | ||
|
@@ -220,7 +220,7 @@ jobs: | |
fail-fast: false | ||
matrix: | ||
python-version: ['3.8'] | ||
daft-runner: [py, ray] | ||
daft-runner: [py, ray, native] | ||
steps: | ||
- uses: actions/checkout@v4 | ||
with: | ||
|
@@ -295,7 +295,7 @@ jobs: | |
fail-fast: false | ||
matrix: | ||
python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs | ||
daft-runner: [py, ray] | ||
daft-runner: [py, ray, native] | ||
steps: | ||
- uses: actions/checkout@v4 | ||
with: | ||
|
@@ -373,7 +373,7 @@ jobs: | |
fail-fast: false | ||
matrix: | ||
python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs | ||
daft-runner: [py, ray] | ||
daft-runner: [py, ray, native] | ||
# These permissions are needed to interact with GitHub's OIDC Token endpoint. | ||
# This is used in the step "Assume GitHub Actions AWS Credentials" | ||
permissions: | ||
|
@@ -467,11 +467,7 @@ jobs: | |
fail-fast: false | ||
matrix: | ||
python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs | ||
daft-runner: [py, ray] | ||
enable-native-executor: [0, 1] | ||
exclude: | ||
- daft-runner: ray | ||
enable-native-executor: 1 | ||
daft-runner: [py, ray, native] | ||
steps: | ||
- uses: actions/checkout@v4 | ||
with: | ||
|
@@ -517,7 +513,6 @@ jobs: | |
pytest tests/integration/iceberg -m 'integration' --durations=50 | ||
env: | ||
DAFT_RUNNER: ${{ matrix.daft-runner }} | ||
DAFT_ENABLE_NATIVE_EXECUTOR: ${{ matrix.enable-native-executor }} | ||
- name: Send Slack notification on failure | ||
uses: slackapi/[email protected] | ||
if: ${{ failure() && (github.ref == 'refs/heads/main') }} | ||
|
@@ -549,11 +544,7 @@ jobs: | |
fail-fast: false | ||
matrix: | ||
python-version: ['3.8'] # can't use 3.7 due to requiring anon mode for adlfs | ||
daft-runner: [py, ray] | ||
enable-native-executor: [0, 1] | ||
exclude: | ||
- daft-runner: ray | ||
enable-native-executor: 1 | ||
daft-runner: [py, ray, native] | ||
steps: | ||
- uses: actions/checkout@v4 | ||
with: | ||
|
@@ -598,7 +589,6 @@ jobs: | |
pytest tests/integration/sql -m 'integration or not integration' --durations=50 | ||
env: | ||
DAFT_RUNNER: ${{ matrix.daft-runner }} | ||
DAFT_ENABLE_NATIVE_EXECUTOR: ${{ matrix.enable-native-executor }} | ||
- name: Send Slack notification on failure | ||
uses: slackapi/[email protected] | ||
if: ${{ failure() && (github.ref == 'refs/heads/main') }} | ||
|
@@ -681,6 +671,8 @@ jobs: | |
uses: CodSpeedHQ/action@v3 | ||
with: | ||
run: pytest tests/benchmarks -m benchmark --codspeed | ||
env: | ||
DAFT_RUNNER: native | ||
- name: Send Slack notification on failure | ||
uses: slackapi/[email protected] | ||
if: ${{ failure() && (github.ref == 'refs/heads/main') }} | ||
|
@@ -810,6 +802,8 @@ jobs: | |
source activate | ||
maturin develop | ||
pytest --doctest-modules --continue-on-collection-errors daft/dataframe/dataframe.py daft/expressions/expressions.py daft/convert.py daft/udf.py | ||
env: | ||
DAFT_RUNNER: py | ||
|
||
publish-coverage-reports: | ||
name: Publish coverage reports to CodeCov | ||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
from __future__ import annotations | ||
|
||
import logging | ||
from typing import TYPE_CHECKING, Iterator | ||
|
||
from daft.context import get_context | ||
from daft.daft import FileFormatConfig, FileInfos, IOConfig | ||
from daft.execution.native_executor import NativeExecutor | ||
from daft.filesystem import glob_path_with_stats | ||
from daft.runners import runner_io | ||
from daft.runners.partitioning import ( | ||
LocalMaterializedResult, | ||
LocalPartitionSet, | ||
PartitionCacheEntry, | ||
PartitionSetCache, | ||
) | ||
from daft.runners.runner import LOCAL_PARTITION_SET_CACHE, Runner | ||
from daft.table import MicroPartition | ||
|
||
if TYPE_CHECKING: | ||
from daft.logical.builder import LogicalPlanBuilder | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class NativeRunnerIO(runner_io.RunnerIO): | ||
def glob_paths_details( | ||
self, | ||
source_paths: list[str], | ||
file_format_config: FileFormatConfig | None = None, | ||
io_config: IOConfig | None = None, | ||
) -> FileInfos: | ||
file_infos = FileInfos() | ||
file_format = file_format_config.file_format() if file_format_config is not None else None | ||
for source_path in source_paths: | ||
path_file_infos = glob_path_with_stats(source_path, file_format, io_config) | ||
|
||
if len(path_file_infos) == 0: | ||
raise FileNotFoundError(f"No files found at {source_path}") | ||
|
||
file_infos.extend(path_file_infos) | ||
|
||
return file_infos | ||
|
||
|
||
class NativeRunner(Runner[MicroPartition]): | ||
def __init__(self) -> None: | ||
super().__init__() | ||
|
||
def initialize_partition_set_cache(self) -> PartitionSetCache: | ||
return LOCAL_PARTITION_SET_CACHE | ||
|
||
def runner_io(self) -> NativeRunnerIO: | ||
return NativeRunnerIO() | ||
|
||
def run(self, builder: LogicalPlanBuilder) -> PartitionCacheEntry: | ||
results = list(self.run_iter(builder)) | ||
|
||
result_pset = LocalPartitionSet() | ||
for i, result in enumerate(results): | ||
result_pset.set_partition(i, result) | ||
|
||
pset_entry = self.put_partition_set_into_cache(result_pset) | ||
return pset_entry | ||
|
||
def run_iter( | ||
self, | ||
builder: LogicalPlanBuilder, | ||
results_buffer_size: int | None = None, | ||
) -> Iterator[LocalMaterializedResult]: | ||
# NOTE: Freeze and use this same execution config for the entire execution | ||
daft_execution_config = get_context().daft_execution_config | ||
|
||
# Optimize the logical plan. | ||
builder = builder.optimize() | ||
executor = NativeExecutor.from_logical_plan_builder(builder) | ||
results_gen = executor.run( | ||
{k: v.values() for k, v in self._part_set_cache.get_all_partition_sets().items()}, | ||
daft_execution_config, | ||
results_buffer_size, | ||
) | ||
yield from results_gen | ||
|
||
def run_iter_tables( | ||
self, builder: LogicalPlanBuilder, results_buffer_size: int | None = None | ||
) -> Iterator[MicroPartition]: | ||
for result in self.run_iter(builder, results_buffer_size=results_buffer_size): | ||
yield result.partition() |
Oops, something went wrong.