Merge branch main into colin/swordfish-outer-join
Colin Ho authored and Colin Ho committed Sep 25, 2024
2 parents 94245f8 + 412dfbc commit 9db56c3
Showing 395 changed files with 7,449 additions and 4,581 deletions.
38 changes: 0 additions & 38 deletions .github/ISSUE_TEMPLATE/bug_report.md

This file was deleted.

45 changes: 45 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,45 @@
name: Bug report
description: Create a report to help us improve Daft
labels: [bug, needs triage]
body:
  - type: textarea
    attributes:
      label: Describe the bug
      description: Describe the bug.
      placeholder: >
        A clear and concise description of what the bug is.
    validations:
      required: true
  - type: textarea
    attributes:
      label: To Reproduce
      placeholder: >
        Steps to reproduce the behavior:
  - type: textarea
    attributes:
      label: Expected behavior
      placeholder: >
        A clear and concise description of what you expected to happen.
  - type: dropdown
    id: component
    attributes:
      label: Component(s)
      multiple: true
      options:
        - Expressions
        - SQL
        - Python Runner
        - Ray Runner
        - Parquet
        - CSV
        - Continuous Integration
        - Developer Tools
        - Documentation
        - Other
    validations:
      required: true
  - type: textarea
    attributes:
      label: Additional context
      placeholder: >
        Add any other context about the problem here.
6 changes: 3 additions & 3 deletions .github/workflows/nightlies-tests.yml
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@ jobs:
fetch-depth: 0

- name: Install uv
-      uses: astral-sh/setup-uv@v2
+      uses: astral-sh/setup-uv@v3
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v5
with:
@@ -103,7 +103,7 @@ jobs:
if: runner.os == 'macos'

- name: Install uv
-      uses: astral-sh/setup-uv@v2
+      uses: astral-sh/setup-uv@v3
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v5
with:
@@ -127,7 +127,7 @@ jobs:
role-to-assume: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
role-session-name: DaftPythonPackageGitHubWorkflow
- name: Spin up IO services
-      uses: isbang/compose-action@v2.0.0
+      uses: isbang/compose-action@v2.0.2
with:
compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
down-flags: --volumes
4 changes: 2 additions & 2 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
@@ -330,7 +330,7 @@ jobs:
mkdir -p /tmp/daft-integration-testing/nginx
chmod +rw /tmp/daft-integration-testing/nginx
- name: Spin up IO services
-      uses: isbang/compose-action@v2.0.0
+      uses: isbang/compose-action@v2.0.2
with:
compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
down-flags: --volumes
@@ -427,7 +427,7 @@ jobs:
# workload_identity_provider: ${{ secrets.ACTIONS_GCP_WORKLOAD_IDENTITY_PROVIDER }}
# service_account: ${{ secrets.ACTIONS_GCP_SERVICE_ACCOUNT }}
- name: Spin up IO services
-      uses: isbang/compose-action@v2.0.0
+      uses: isbang/compose-action@v2.0.2
with:
compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
down-flags: --volumes
26 changes: 24 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

9 changes: 7 additions & 2 deletions Cargo.toml
@@ -72,7 +72,7 @@ publish = false
version = "0.3.0-dev0"

[package.metadata.cargo-machete]
-ignored = ["lzma_sys"]
+ignored = ["lzma-sys"]

[patch.crates-io]
arrow2 = {path = "src/arrow2"}
@@ -149,6 +149,7 @@ bytes = "1.6.0"
chrono = "0.4.38"
chrono-tz = "0.8.4"
comfy-table = "7.1.1"
+derivative = "2.2.0"
dyn-clone = "1"
futures = "0.3.30"
html-escape = "0.2.13"
@@ -161,6 +162,7 @@ jaq-std = "1.2.0"
num-derive = "0.3.3"
num-traits = "0.2"
once_cell = "1.19.0"
+path_macro = "1.0.0"
pretty_assertions = "1.4.0"
rand = "^0.8"
rayon = "1.10.0"
@@ -169,7 +171,7 @@ rstest = "0.18.2"
serde_json = "1.0.116"
sketches-ddsketch = {version = "0.2.2", features = ["use_serde"]}
snafu = {version = "0.7.4", features = ["futures"]}
-sqlparser = "0.49.0"
+sqlparser = "0.51.0"
sysinfo = "0.30.12"
test-log = "0.2.16"
thiserror = "1.0.63"
@@ -222,6 +224,9 @@ version = "0.11.0"
features = ["derive", "rc"]
version = "1.0.200"

+[workspace.lints.clippy]
+use-self = "deny"

[workspace.package]
edition = "2021"
version = "0.3.0-dev0"
26 changes: 18 additions & 8 deletions benchmarking/tpch/__main__.py
@@ -123,7 +123,7 @@ def _get_df(table_name: str) -> DataFrame:

def run_all_benchmarks(
parquet_folder: str,
-    skip_questions: set[int],
+    questions: list[int],
csv_output_location: str | None,
ray_job_dashboard_url: str | None = None,
requirements: str | None = None,
@@ -133,11 +133,7 @@ def run_all_benchmarks(
daft_context = get_context()
metrics_builder = MetricsBuilder(daft_context.runner_config.name)

-    for i in range(1, 23):
-        if i in skip_questions:
-            logger.warning("Skipping TPC-H q%s", i)
-            continue
-
+    for i in questions:
# Run as a Ray Job if dashboard URL is provided
if ray_job_dashboard_url is not None:
from benchmarking.tpch import ray_job_runner
@@ -202,7 +198,10 @@ def get_ray_runtime_env(requirements: str | None) -> dict:
runtime_env = {
"py_modules": [daft],
"eager_install": True,
"env_vars": {"DAFT_PROGRESS_BAR": "0"},
"env_vars": {
"DAFT_PROGRESS_BAR": "0",
"DAFT_RUNNER": "ray",
},
}
if requirements:
runtime_env.update({"pip": requirements})
@@ -266,6 +265,7 @@ def warm_up_function():
parser.add_argument(
"--num_parts", default=None, help="Number of parts to generate (defaults to 1 part per GB)", type=int
)
+    parser.add_argument("--questions", type=str, default=None, help="Comma-separated list of questions to run")
parser.add_argument("--skip_questions", type=str, default=None, help="Comma-separated list of questions to skip")
parser.add_argument("--output_csv", default=None, type=str, help="Location to output CSV file")
parser.add_argument(
@@ -310,9 +310,19 @@ def warm_up_function():
else:
warmup_environment(args.requirements, parquet_folder)

+    if args.skip_questions is not None:
+        if args.questions is not None:
+            raise ValueError("Cannot specify both --questions and --skip_questions")
+        skip_questions = {int(s) for s in args.skip_questions.split(",")}
+        questions = [q for q in range(1, MetricsBuilder.NUM_TPCH_QUESTIONS + 1) if q not in skip_questions]
+    elif args.questions is not None:
+        questions = sorted(set(int(s) for s in args.questions.split(",")))
+    else:
+        questions = list(range(1, MetricsBuilder.NUM_TPCH_QUESTIONS + 1))

run_all_benchmarks(
parquet_folder,
-        skip_questions={int(s) for s in args.skip_questions.split(",")} if args.skip_questions is not None else set(),
+        questions=questions,
csv_output_location=args.output_csv,
ray_job_dashboard_url=args.ray_job_dashboard_url,
requirements=args.requirements,
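The net effect of the changes above: the question list is resolved once, up front, instead of being filtered inside the benchmark loop. A minimal standalone sketch of that resolution logic, assuming 22 TPC-H questions (per the range(1, 23) loop it replaces); resolve_questions is an illustrative name, not a function in the repo:

    from __future__ import annotations


    def resolve_questions(questions_arg: str | None, skip_arg: str | None, num_questions: int = 22) -> list[int]:
        """Resolve --questions / --skip_questions flags into an ordered list of question numbers."""
        if skip_arg is not None:
            if questions_arg is not None:
                raise ValueError("Cannot specify both --questions and --skip_questions")
            skipped = {int(s) for s in skip_arg.split(",")}
            return [q for q in range(1, num_questions + 1) if q not in skipped]
        if questions_arg is not None:
            # Duplicates collapse and order normalizes, mirroring the sorted(set(...)) above.
            return sorted({int(s) for s in questions_arg.split(",")})
        return list(range(1, num_questions + 1))


    assert resolve_questions("5,3,5", None) == [3, 5]
    assert resolve_questions(None, "1,2") == list(range(3, 23))
    assert resolve_questions(None, None) == list(range(1, 23))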
11 changes: 4 additions & 7 deletions benchmarking/tpch/answers.py
@@ -106,12 +106,12 @@ def q4(get_df: GetDFFunc) -> DataFrame:
(col("O_ORDERDATE") >= datetime.date(1993, 7, 1)) & (col("O_ORDERDATE") < datetime.date(1993, 10, 1))
)

-    lineitems = lineitems.where(col("L_COMMITDATE") < col("L_RECEIPTDATE")).select(col("L_ORDERKEY")).distinct()
+    lineitems = lineitems.where(col("L_COMMITDATE") < col("L_RECEIPTDATE"))

daft_df = (
-        lineitems.join(orders, left_on=col("L_ORDERKEY"), right_on=col("O_ORDERKEY"))
+        orders.join(lineitems, left_on=col("O_ORDERKEY"), right_on=col("L_ORDERKEY"), how="semi")
.groupby(col("O_ORDERPRIORITY"))
-        .agg(col("L_ORDERKEY").count().alias("order_count"))
+        .agg(col("O_ORDERKEY").count().alias("order_count"))
.sort(col("O_ORDERPRIORITY"))
)
return daft_df
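An editorial note on the q4 rewrite above: the inner join on deduplicated lineitems becomes a semi join, which keeps each matching order exactly once with no distinct() pass. A minimal sketch of the semantics, assuming a local daft install (the how="semi" call mirrors the diff; the toy data is illustrative):

    import daft
    from daft import col

    orders = daft.from_pydict({"O_ORDERKEY": [1, 2, 3]})
    lineitems = daft.from_pydict({"L_ORDERKEY": [1, 1, 3]})

    # Order 1 matches two lineitems but appears at most once in a semi join,
    # so the follow-up distinct() from the old version is unnecessary.
    matched = orders.join(lineitems, left_on=col("O_ORDERKEY"), right_on=col("L_ORDERKEY"), how="semi")
    print(matched.to_pydict())  # expected: {"O_ORDERKEY": [1, 3]} (row order may vary)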
@@ -660,11 +660,8 @@ def q22(get_df: GetDFFunc) -> DataFrame:
res_1.where(col("C_ACCTBAL") > 0).agg(col("C_ACCTBAL").mean().alias("avg_acctbal")).with_column("lit", lit(1))
)

-    res_3 = orders.select("O_CUSTKEY")
-
daft_df = (
-        res_1.join(res_3, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="left")
-        .where(col("O_CUSTKEY").is_null())
+        res_1.join(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="anti")
.with_column("lit", lit(1))
.join(res_2, on="lit")
.where(col("C_ACCTBAL") > col("avg_acctbal"))
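Likewise for q22: the old left-join-then-is_null() idiom collapses into an anti join, which directly keeps customers that have no matching orders. A sketch under the same assumptions as the q4 example:

    import daft

    customers = daft.from_pydict({"C_CUSTKEY": [1, 2, 3]})
    orders = daft.from_pydict({"O_CUSTKEY": [1, 1]})

    # An anti join keeps only left rows with no match on the right, which is
    # equivalent to the old left join followed by .where(col("O_CUSTKEY").is_null()).
    no_orders = customers.join(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="anti")
    print(no_orders.to_pydict())  # expected: {"C_CUSTKEY": [2, 3]} (row order may vary)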
2 changes: 1 addition & 1 deletion benchmarking/tpch/ray_job_runner.py
@@ -57,7 +57,7 @@ def ray_job_params(
) -> dict:
return dict(
submission_id=f"tpch-q{tpch_qnum}-{str(uuid.uuid4())[:4]}",
-        entrypoint=f"python {str(entrypoint.relative_to(working_dir))} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}",
+        entrypoint=f"python3 {str(entrypoint.relative_to(working_dir))} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}",
runtime_env={
"working_dir": str(working_dir),
**runtime_env,
2 changes: 2 additions & 0 deletions codecov.yml
@@ -4,6 +4,8 @@ ignore:
- tutorials
- tools
- daft/pickle
+  - src/arrow2
+  - src/parquet2

comment:
layout: reach, diff, flags, files
13 changes: 13 additions & 0 deletions daft/.ruff.toml
@@ -0,0 +1,13 @@
extend = "../.ruff.toml"

[lint]
extend-select = [
"TID253", # banned-module-level-imports, derived from flake8-tidy-imports
"TCH" # flake8-type-checking
]

[lint.flake8-tidy-imports]
# Ban certain modules from being imported at module level, instead requiring
# that they're imported lazily (e.g., within a function definition,
# with daft.lazy_import.LazyImport, or with TYPE_CHECKING).
banned-module-level-imports = ["daft.unity_catalog", "fsspec", "numpy", "pandas", "PIL", "pyarrow", "ray", "xml"]
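This ban keeps heavy dependencies off the critical path of import daft. A minimal sketch of two compliant patterns the comment names, TYPE_CHECKING imports and function-level imports; to_arrow_sketch is a hypothetical helper, not a repo function:

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Fine under TID253: only type checkers resolve this import, never the runtime.
        import numpy as np


    def to_arrow_sketch(values: np.ndarray):  # hypothetical helper for illustration
        # Fine under TID253: pyarrow loads lazily on first call, not at module import.
        import pyarrow as pa

        return pa.array(values)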
2 changes: 1 addition & 1 deletion daft/__init__.py
@@ -88,7 +88,7 @@ def refresh_logger() -> None:
read_lance,
)
from daft.series import Series
-from daft.sql.sql import sql, sql_expr
+from daft.sql import sql, sql_expr
from daft.udf import udf
from daft.viz import register_viz_hook
