diff --git a/daft/expressions/expressions.py b/daft/expressions/expressions.py
index a5e9baebf0..90c53503fa 100644
--- a/daft/expressions/expressions.py
+++ b/daft/expressions/expressions.py
@@ -425,7 +425,7 @@ def download(
     ) -> Expression:
         """Treats each string as a URL, and downloads the bytes contents as a bytes column
 
-        ..NOTE::
+        .. NOTE::
             If you are observing excessive S3 issues (such as timeouts, DNS errors or slowdown errors) during URL
             downloads, you may wish to reduce the value of ``max_connections`` (defaults to 32) to reduce the amount of
             load you are placing on your S3 servers.
diff --git a/docs/source/api_docs/input_output.rst b/docs/source/api_docs/input_output.rst
index 7d49da4c79..4203debcec 100644
--- a/docs/source/api_docs/input_output.rst
+++ b/docs/source/api_docs/input_output.rst
@@ -8,16 +8,6 @@ Input/Output
 Configuration
 -------------
 
-.. NOTE::
-    Daft is currently building out its own native code for reading/writing data. These configuration objects allow
-    users to control behavior when Daft runs native code, but otherwise will have no effect.
-
-    These configurations are currently used in:
-
-    1. :func:`daft.read_parquet`: controls behavior when reading DataFrames Parquet files using the native downloader
-    2. :func:`Expression.url.download()`: controls behavior when downloading bytes from URLs using the native downloader
-    3. :func:`Table.read_parquet`: (Advanced usecase!) controls behavior when reading a Daft Table from a Parquet file
-
 .. autosummary::
    :nosignatures:
    :toctree: doc_gen/io_configs
diff --git a/tutorials/feature_of_the_week/io_config.ipynb b/tutorials/feature_of_the_week/io_config.ipynb
new file mode 100644
index 0000000000..2829c3fa78
--- /dev/null
+++ b/tutorials/feature_of_the_week/io_config.ipynb
@@ -0,0 +1,134 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install getdaft"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Feature of the Week: Input/Output Configurations (IOConfig)\n",
+    "\n",
+    "`IOConfig` is Daft's mechanism for controlling the behavior of data input/output from storage. It is useful for:\n",
+    "\n",
+    "1. **Providing credentials** for authenticating with cloud storage services\n",
+    "2. **Tuning performance** or reducing load on storage services\n",
+    "\n",
+    "For a deeper look at `IOConfig`, see: [IOConfig Documentation](https://www.getdaft.io/projects/docs/en/latest/api_docs/doc_gen/io_configs/daft.io.IOConfig.html?highlight=IOConfig)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Default IOConfig Behavior\n",
+    "\n",
+    "By default, `IOConfig` automatically detects credentials available on your machine."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import daft\n",
+    "\n",
+    "# By default, calls to AWS S3 use credentials retrieved from the machine(s) that they are made from\n",
+    "#\n",
+    "# For AWS S3, the default mechanism is to look through a chain of possible \"providers\":\n",
+    "# https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials\n",
+    "df = daft.read_csv(\"s3://daft-public-data/file.csv\")\n",
+    "df.collect()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Overriding the IOConfig\n",
+    "### Setting a Global Override\n",
+    "\n",
+    "Often you may want Daft to use a certain configuration by default whenever it accesses storage such as S3, GCS, or Azure Blob Storage.\n",
+    "\n",
+    "> **Example:**\n",
+    ">\n",
+    "> A common use-case is to create a set of temporary credentials once and share them across all data access calls made by Daft.\n",
+    ">\n",
+    "> The example below demonstrates this with AWS's `boto3` Python SDK."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Use the boto3 library to retrieve (possibly temporary) credentials for S3 access\n",
+    "import boto3\n",
+    "session = boto3.session.Session()\n",
+    "creds = session.get_credentials()\n",
+    "\n",
+    "# Attach the credentials to a Daft IOConfig object\n",
+    "MY_IO_CONFIG = daft.io.IOConfig(\n",
+    "    s3=daft.io.S3Config(\n",
+    "        key_id=creds.access_key,\n",
+    "        access_key=creds.secret_key,\n",
+    "        session_token=creds.token,\n",
+    "    )\n",
+    ")\n",
+    "\n",
+    "# Set the default config to `MY_IO_CONFIG` so that it is used in the absence of any overrides\n",
+    "daft.set_planning_config(default_io_config=MY_IO_CONFIG)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Overriding IOConfigs per-API call\n",
+    "\n",
+    "Daft also allows for more granular, per-call overrides through the use of keyword arguments.\n",
+    "\n",
+    "This is extremely flexible, allowing you to use a different set of credentials for each location you read from!\n",
+    "\n",
+    "Here we use `daft.read_csv` as an example, but the same `io_config=...` keyword argument also exists for other I/O-related functionality such as:\n",
+    "\n",
+    "1. `daft.read_parquet`\n",
+    "2. `daft.read_json`\n",
+    "3. `Expression.url.download()`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# An \"anonymous\" IOConfig accesses storage **without credentials**, and can only access fully public data\n",
+    "MY_ANONYMOUS_IO_CONFIG = daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True))\n",
+    "\n",
+    "# Read this file using `MY_ANONYMOUS_IO_CONFIG` instead of the global default `MY_IO_CONFIG`\n",
+    "df1 = daft.read_csv(\"s3://daft-public-data/melbourne-airbnb/melbourne_airbnb.csv\", io_config=MY_ANONYMOUS_IO_CONFIG)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
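For reviewers who want to see the two override styles working together, here is a minimal sketch (not part of the diff above). It combines the anonymous-access pattern from the notebook with the `io_config=` keyword on `Expression.url.download()`, which the notebook lists as also accepting per-call overrides. The `urls` and `bytes` column names are illustrative assumptions, not from the PR:

```python
import daft
from daft import col

# Anonymous access: no credentials, public data only (mirrors the notebook cell)
ANON_IO_CONFIG = daft.io.IOConfig(s3=daft.io.S3Config(anonymous=True))

# Per-call override on read_csv: this read uses ANON_IO_CONFIG regardless of
# any global default set via daft.set_planning_config(...)
df = daft.read_csv("s3://daft-public-data/file.csv", io_config=ANON_IO_CONFIG)

# Per-call override on url.download(): assumes a hypothetical "urls" column of
# S3 URLs; downloads each URL's contents as bytes using the same anonymous config
df = df.with_column("bytes", col("urls").url.download(io_config=ANON_IO_CONFIG))
df.collect()
```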