From 9c66a5e07d6e43202685cf2ae2540a48b165eef5 Mon Sep 17 00:00:00 2001 From: Kev Wang Date: Wed, 21 Feb 2024 10:42:43 -0800 Subject: [PATCH] [DOCS] Add documentation for using and developing Daft on Ray (#1896) --- CONTRIBUTING.md | 11 +++ docs/source/user_guide/integrations.rst | 1 + docs/source/user_guide/integrations/ray.rst | 90 +++++++++++++++++++++ 3 files changed, 102 insertions(+) create mode 100644 docs/source/user_guide/integrations/ray.rst diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5f13ac06bf..793e4c6c8c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -36,3 +36,14 @@ To set up your development environment: 1. `make build`: recompile your code after modifying any Rust code in `src/` 2. `make test`: run tests 3. `DAFT_RUNNER=ray make test`: set the runner to the Ray runner and run tests (DAFT_RUNNER defaults to `py`) + +### Developing with Ray + +Running a development version of Daft on a local Ray cluster is as simple as including `daft.context.set_runner_ray()` in your Python script and then building and executing it as usual. + +To use a remote Ray cluster, run the following steps on the same operating system version as your Ray nodes, in order to ensure that your binaries are executable on Ray. + +1. `mkdir wd`: this is the working directory, it will hold all the files to be submitted to Ray for a job +2. `ln -s daft wd/daft`: create a symbolic link from the Python module to the working directory +3. `make build-release`: an optimized build to ensure that the module is small enough to be successfully uploaded to Ray. Run this after modifying any Rust code in `src/` +4. `ray job submit --working-dir wd --address "http://:8265" -- python script.py`: submit `wd/script.py` to be run on Ray diff --git a/docs/source/user_guide/integrations.rst b/docs/source/user_guide/integrations.rst index 4a565cfa22..6fdd7adcfe 100644 --- a/docs/source/user_guide/integrations.rst +++ b/docs/source/user_guide/integrations.rst @@ -3,6 +3,7 @@ Integrations .. toctree:: + integrations/ray integrations/iceberg integrations/microsoft-azure integrations/aws diff --git a/docs/source/user_guide/integrations/ray.rst b/docs/source/user_guide/integrations/ray.rst new file mode 100644 index 0000000000..3ea94cd20a --- /dev/null +++ b/docs/source/user_guide/integrations/ray.rst @@ -0,0 +1,90 @@ +Ray +=== + +`Ray `_ is an open-source framework for distributed computing. + +Daft's native support for Ray enables you to run distributed DataFrame workloads at scale. + +Usage +----- + +You can run Daft on Ray in two ways: by using the `Ray Client `_ or by submitting a Ray job. + +Ray Client +********** +The Ray client is quick way to get started with running tasks and retrieving their results on Ray using Python. + +.. WARNING:: + To run tasks using the Ray client, the version of Daft and the minor version (eg. 3.9, 3.10) of Python must match between client and server. + +Here's an example of how you can use the Ray client with Daft: + +.. code:: python + + >>> import daft + >>> import ray + >>> + >>> # Refer to the note under "Ray Job" for details on "runtime_env" + >>> ray.init("ray://:10001", runtime_env={"pip": ["getdaft"]}) + >>> + >>> # Starts the Ray client and tells Daft to use Ray to execute queries + >>> # If ray.init() has already been called, it uses the existing client + >>> daft.context.set_runner_ray("ray://:10001") + >>> + >>> df = daft.from_pydict({ + >>> "a": [3, 2, 5, 6, 1, 4], + >>> "b": [True, False, False, True, True, False] + >>> }) + >>> df = df.where(df["b"]).sort(df["a"]) + >>> + >>> # Daft executes the query remotely and returns a preview to the client + >>> df.collect() + ╭───────┬─────────╮ + │ a ┆ b │ + │ --- ┆ --- │ + │ Int64 ┆ Boolean │ + ╞═══════╪═════════╡ + │ 1 ┆ true │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ 3 ┆ true │ + ├╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤ + │ 6 ┆ true │ + ╰───────┴─────────╯ + + (Showing first 3 of 3 rows) + +Ray Job +******* +Ray jobs allow for more control and observability over using the Ray client. In addition, your entire code runs on Ray, which means it is not constrained by the compute, network, library versions, or availability of your local machine. + +.. code:: python + + # wd/job.py + + import daft + + def main(): + # call without any arguments to connect to Ray from the head node + daft.context.set_runner_ray() + + # ... Run Daft commands here ... + + if __name__ == "__main__": + main() + +To submit this script as a job, use the Ray CLI, which can be installed with `pip install "ray[default]"`. + +.. code:: sh + + ray job submit \ + --working-dir wd \ + --address "http://:8265" \ + --runtime-env-json '{"pip": ["getdaft"]}' \ + -- python job.py + +.. NOTE:: + + The runtime env parameter specifies that Daft should be installed on the Ray workers. Alternative methods of including Daft in the worker dependencies can be found `here `_. + + +For more information about Ray jobs, see `Ray docs -> Ray Jobs Overview `_.