Merge branch main into colin/swordfish-outer-join
Colin Ho authored and Colin Ho committed Sep 25, 2024
2 parents 94245f8 + 412dfbc commit 9db56c3
Showing 395 changed files with 7,449 additions and 4,581 deletions.
38 changes: 0 additions & 38 deletions .github/ISSUE_TEMPLATE/bug_report.md

This file was deleted.

45 changes: 45 additions & 0 deletions .github/ISSUE_TEMPLATE/bug_report.yml
@@ -0,0 +1,45 @@
name: Bug report
description: Create a report to help us improve Daft
labels: [bug, needs triage]
body:
  - type: textarea
    attributes:
      label: Describe the bug
      description: Describe the bug.
      placeholder: >
        A clear and concise description of what the bug is.
    validations:
      required: true
  - type: textarea
    attributes:
      label: To Reproduce
      placeholder: >
        Steps to reproduce the behavior:
  - type: textarea
    attributes:
      label: Expected behavior
      placeholder: >
        A clear and concise description of what you expected to happen.
  - type: dropdown
    id: component
    attributes:
      label: Component(s)
      multiple: true
      options:
        - Expressions
        - SQL
        - Python Runner
        - Ray Runner
        - Parquet
        - CSV
        - Continuous Integration
        - Developer Tools
        - Documentation
        - Other
    validations:
      required: true
  - type: textarea
    attributes:
      label: Additional context
      placeholder: >
        Add any other context about the problem here.
6 changes: 3 additions & 3 deletions .github/workflows/nightlies-tests.yml
Original file line number Diff line number Diff line change
@@ -32,7 +32,7 @@ jobs:
fetch-depth: 0

- name: Install uv
-      uses: astral-sh/setup-uv@v2
+      uses: astral-sh/setup-uv@v3
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v5
with:
@@ -103,7 +103,7 @@ jobs:
if: runner.os == 'macos'

- name: Install uv
-      uses: astral-sh/setup-uv@v2
+      uses: astral-sh/setup-uv@v3
- name: Set up Python ${{ env.PYTHON_VERSION }}
uses: actions/setup-python@v5
with:
@@ -127,7 +127,7 @@ jobs:
role-to-assume: ${{ secrets.ACTIONS_AWS_ROLE_ARN }}
role-session-name: DaftPythonPackageGitHubWorkflow
- name: Spin up IO services
-      uses: isbang/compose-action@v2.0.0
+      uses: isbang/compose-action@v2.0.2
with:
compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
down-flags: --volumes
4 changes: 2 additions & 2 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
@@ -330,7 +330,7 @@ jobs:
mkdir -p /tmp/daft-integration-testing/nginx
chmod +rw /tmp/daft-integration-testing/nginx
- name: Spin up IO services
-      uses: isbang/compose-action@v2.0.0
+      uses: isbang/compose-action@v2.0.2
with:
compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
down-flags: --volumes
@@ -427,7 +427,7 @@ jobs:
# workload_identity_provider: ${{ secrets.ACTIONS_GCP_WORKLOAD_IDENTITY_PROVIDER }}
# service_account: ${{ secrets.ACTIONS_GCP_SERVICE_ACCOUNT }}
- name: Spin up IO services
-      uses: isbang/compose-action@v2.0.0
+      uses: isbang/compose-action@v2.0.2
with:
compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
down-flags: --volumes
26 changes: 24 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default.

9 changes: 7 additions & 2 deletions Cargo.toml
@@ -72,7 +72,7 @@ publish = false
version = "0.3.0-dev0"

[package.metadata.cargo-machete]
-ignored = ["lzma_sys"]
+ignored = ["lzma-sys"]

[patch.crates-io]
arrow2 = {path = "src/arrow2"}
@@ -149,6 +149,7 @@ bytes = "1.6.0"
chrono = "0.4.38"
chrono-tz = "0.8.4"
comfy-table = "7.1.1"
+derivative = "2.2.0"
dyn-clone = "1"
futures = "0.3.30"
html-escape = "0.2.13"
@@ -161,6 +162,7 @@ jaq-std = "1.2.0"
num-derive = "0.3.3"
num-traits = "0.2"
once_cell = "1.19.0"
+path_macro = "1.0.0"
pretty_assertions = "1.4.0"
rand = "^0.8"
rayon = "1.10.0"
@@ -169,7 +171,7 @@ rstest = "0.18.2"
serde_json = "1.0.116"
sketches-ddsketch = {version = "0.2.2", features = ["use_serde"]}
snafu = {version = "0.7.4", features = ["futures"]}
-sqlparser = "0.49.0"
+sqlparser = "0.51.0"
sysinfo = "0.30.12"
test-log = "0.2.16"
thiserror = "1.0.63"
@@ -222,6 +224,9 @@ version = "0.11.0"
features = ["derive", "rc"]
version = "1.0.200"

+[workspace.lints.clippy]
+use-self = "deny"

[workspace.package]
edition = "2021"
version = "0.3.0-dev0"
26 changes: 18 additions & 8 deletions benchmarking/tpch/__main__.py
@@ -123,7 +123,7 @@ def _get_df(table_name: str) -> DataFrame:

def run_all_benchmarks(
parquet_folder: str,
-    skip_questions: set[int],
+    questions: list[int],
csv_output_location: str | None,
ray_job_dashboard_url: str | None = None,
requirements: str | None = None,
@@ -133,11 +133,7 @@ def run_all_benchmarks(
daft_context = get_context()
metrics_builder = MetricsBuilder(daft_context.runner_config.name)

-    for i in range(1, 23):
-        if i in skip_questions:
-            logger.warning("Skipping TPC-H q%s", i)
-            continue
-
+    for i in questions:
# Run as a Ray Job if dashboard URL is provided
if ray_job_dashboard_url is not None:
from benchmarking.tpch import ray_job_runner
@@ -202,7 +198,10 @@ def get_ray_runtime_env(requirements: str | None) -> dict:
runtime_env = {
"py_modules": [daft],
"eager_install": True,
"env_vars": {"DAFT_PROGRESS_BAR": "0"},
"env_vars": {
"DAFT_PROGRESS_BAR": "0",
"DAFT_RUNNER": "ray",
},
}
if requirements:
runtime_env.update({"pip": requirements})
@@ -266,6 +265,7 @@ def warm_up_function():
parser.add_argument(
"--num_parts", default=None, help="Number of parts to generate (defaults to 1 part per GB)", type=int
)
+    parser.add_argument("--questions", type=str, default=None, help="Comma-separated list of questions to run")
parser.add_argument("--skip_questions", type=str, default=None, help="Comma-separated list of questions to skip")
parser.add_argument("--output_csv", default=None, type=str, help="Location to output CSV file")
parser.add_argument(
@@ -310,9 +310,19 @@ def warm_up_function():
else:
warmup_environment(args.requirements, parquet_folder)

+    if args.skip_questions is not None:
+        if args.questions is not None:
+            raise ValueError("Cannot specify both --questions and --skip_questions")
+        skip_questions = {int(s) for s in args.skip_questions.split(",")}
+        questions = [q for q in range(1, MetricsBuilder.NUM_TPCH_QUESTIONS + 1) if q not in skip_questions]
+    elif args.questions is not None:
+        questions = sorted(set(int(s) for s in args.questions.split(",")))
+    else:
+        questions = list(range(1, MetricsBuilder.NUM_TPCH_QUESTIONS + 1))

run_all_benchmarks(
parquet_folder,
-        skip_questions={int(s) for s in args.skip_questions.split(",")} if args.skip_questions is not None else set(),
+        questions=questions,
csv_output_location=args.output_csv,
ray_job_dashboard_url=args.ray_job_dashboard_url,
requirements=args.requirements,
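The net effect of the changes above: the question list is resolved once, up front, instead of being filtered inside the benchmark loop. A minimal standalone sketch of that resolution logic, assuming 22 TPC-H questions (per the range(1, 23) loop it replaces); resolve_questions is an illustrative name, not a function in the repo:

    from __future__ import annotations


    def resolve_questions(questions_arg: str | None, skip_arg: str | None, num_questions: int = 22) -> list[int]:
        """Resolve --questions / --skip_questions flags into an ordered list of question numbers."""
        if skip_arg is not None:
            if questions_arg is not None:
                raise ValueError("Cannot specify both --questions and --skip_questions")
            skipped = {int(s) for s in skip_arg.split(",")}
            return [q for q in range(1, num_questions + 1) if q not in skipped]
        if questions_arg is not None:
            # Duplicates collapse and order normalizes, mirroring the sorted(set(...)) above.
            return sorted({int(s) for s in questions_arg.split(",")})
        return list(range(1, num_questions + 1))


    assert resolve_questions("5,3,5", None) == [3, 5]
    assert resolve_questions(None, "1,2") == list(range(3, 23))
    assert resolve_questions(None, None) == list(range(1, 23))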
11 changes: 4 additions & 7 deletions benchmarking/tpch/answers.py
@@ -106,12 +106,12 @@ def q4(get_df: GetDFFunc) -> DataFrame:
(col("O_ORDERDATE") >= datetime.date(1993, 7, 1)) & (col("O_ORDERDATE") < datetime.date(1993, 10, 1))
)

-    lineitems = lineitems.where(col("L_COMMITDATE") < col("L_RECEIPTDATE")).select(col("L_ORDERKEY")).distinct()
+    lineitems = lineitems.where(col("L_COMMITDATE") < col("L_RECEIPTDATE"))

daft_df = (
-        lineitems.join(orders, left_on=col("L_ORDERKEY"), right_on=col("O_ORDERKEY"))
+        orders.join(lineitems, left_on=col("O_ORDERKEY"), right_on=col("L_ORDERKEY"), how="semi")
.groupby(col("O_ORDERPRIORITY"))
-        .agg(col("L_ORDERKEY").count().alias("order_count"))
+        .agg(col("O_ORDERKEY").count().alias("order_count"))
.sort(col("O_ORDERPRIORITY"))
)
return daft_df
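An editorial note on the q4 rewrite above: the inner join on deduplicated lineitems becomes a semi join, which keeps each matching order exactly once with no distinct() pass. A minimal sketch of the semantics, assuming a local daft install (the how="semi" call mirrors the diff; the toy data is illustrative):

    import daft
    from daft import col

    orders = daft.from_pydict({"O_ORDERKEY": [1, 2, 3]})
    lineitems = daft.from_pydict({"L_ORDERKEY": [1, 1, 3]})

    # Order 1 matches two lineitems but appears at most once in a semi join,
    # so the follow-up distinct() from the old version is unnecessary.
    matched = orders.join(lineitems, left_on=col("O_ORDERKEY"), right_on=col("L_ORDERKEY"), how="semi")
    print(matched.to_pydict())  # expected: {"O_ORDERKEY": [1, 3]} (row order may vary)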
@@ -660,11 +660,8 @@ def q22(get_df: GetDFFunc) -> DataFrame:
res_1.where(col("C_ACCTBAL") > 0).agg(col("C_ACCTBAL").mean().alias("avg_acctbal")).with_column("lit", lit(1))
)

-    res_3 = orders.select("O_CUSTKEY")
-
daft_df = (
-        res_1.join(res_3, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="left")
-        .where(col("O_CUSTKEY").is_null())
+        res_1.join(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="anti")
.with_column("lit", lit(1))
.join(res_2, on="lit")
.where(col("C_ACCTBAL") > col("avg_acctbal"))
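Likewise for q22: the old left-join-then-is_null() idiom collapses into an anti join, which directly keeps customers that have no matching orders. A sketch under the same assumptions as the q4 example:

    import daft

    customers = daft.from_pydict({"C_CUSTKEY": [1, 2, 3]})
    orders = daft.from_pydict({"O_CUSTKEY": [1, 1]})

    # An anti join keeps only left rows with no match on the right, which is
    # equivalent to the old left join followed by .where(col("O_CUSTKEY").is_null()).
    no_orders = customers.join(orders, left_on="C_CUSTKEY", right_on="O_CUSTKEY", how="anti")
    print(no_orders.to_pydict())  # expected: {"C_CUSTKEY": [2, 3]} (row order may vary)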
2 changes: 1 addition & 1 deletion benchmarking/tpch/ray_job_runner.py
@@ -57,7 +57,7 @@ def ray_job_params(
) -> dict:
return dict(
submission_id=f"tpch-q{tpch_qnum}-{str(uuid.uuid4())[:4]}",
-        entrypoint=f"python {str(entrypoint.relative_to(working_dir))} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}",
+        entrypoint=f"python3 {str(entrypoint.relative_to(working_dir))} --parquet-folder {parquet_folder_path} --question-number {tpch_qnum}",
runtime_env={
"working_dir": str(working_dir),
**runtime_env,
2 changes: 2 additions & 0 deletions codecov.yml
@@ -4,6 +4,8 @@ ignore:
- tutorials
- tools
- daft/pickle
+  - src/arrow2
+  - src/parquet2

comment:
layout: reach, diff, flags, files
13 changes: 13 additions & 0 deletions daft/.ruff.toml
@@ -0,0 +1,13 @@
extend = "../.ruff.toml"

[lint]
extend-select = [
"TID253", # banned-module-level-imports, derived from flake8-tidy-imports
"TCH" # flake8-type-checking
]

[lint.flake8-tidy-imports]
# Ban certain modules from being imported at module level, instead requiring
# that they're imported lazily (e.g., within a function definition,
# with daft.lazy_import.LazyImport, or with TYPE_CHECKING).
banned-module-level-imports = ["daft.unity_catalog", "fsspec", "numpy", "pandas", "PIL", "pyarrow", "ray", "xml"]
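This ban keeps heavy dependencies off the critical path of import daft. A minimal sketch of two compliant patterns the comment names, TYPE_CHECKING imports and function-level imports; to_arrow_sketch is a hypothetical helper, not a repo function:

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Fine under TID253: only type checkers resolve this import, never the runtime.
        import numpy as np


    def to_arrow_sketch(values: np.ndarray):  # hypothetical helper for illustration
        # Fine under TID253: pyarrow loads lazily on first call, not at module import.
        import pyarrow as pa

        return pa.array(values)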
2 changes: 1 addition & 1 deletion daft/__init__.py
@@ -88,7 +88,7 @@ def refresh_logger() -> None:
read_lance,
)
from daft.series import Series
-from daft.sql.sql import sql, sql_expr
+from daft.sql import sql, sql_expr
from daft.udf import udf
from daft.viz import register_viz_hook
