diff --git a/.github/workflows/demos-docker-build.yaml b/.github/workflows/demos-docker-build.yaml index d269b0cc..ea5606e0 100644 --- a/.github/workflows/demos-docker-build.yaml +++ b/.github/workflows/demos-docker-build.yaml @@ -34,7 +34,7 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.7] + python-version: ['3.10'] steps: - uses: actions/checkout@v3 diff --git a/.github/workflows/notebooks.yaml b/.github/workflows/notebooks.yaml index 4330c174..6b7434fa 100644 --- a/.github/workflows/notebooks.yaml +++ b/.github/workflows/notebooks.yaml @@ -27,6 +27,10 @@ on: - '**/*.md' - '.github/renovate.json5' +concurrency: + group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + jobs: python: name: Testing Jupyter Notebooks @@ -34,12 +38,25 @@ jobs: strategy: max-parallel: 4 matrix: - python-version: [3.7] + python-version: ['3.10'] steps: - uses: actions/checkout@v3 - name: Install system dependencies run: sudo apt-get install libsasl2-dev libsasl2-modules + - name: Set up Java + uses: actions/setup-java@v3 + with: + distribution: 'temurin' + # Need Java 8 for Hive + 11 for Spark (and Nessie) + java-version: | + 8 + 11 + - name: setup JAVAx_HOME + run: | + echo "JAVA8_HOME=$JAVA_HOME_8_X64" >> ${GITHUB_ENV} + echo "JAVA11_HOME=$JAVA_HOME_11_X64" >> ${GITHUB_ENV} + echo "JAVA_HOME=$JAVA_HOME_11_X64" >> ${GITHUB_ENV} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -50,8 +67,22 @@ jobs: run: | python -m pip install --upgrade pip python -m pip install tox tox-gh-actions build + - name: Cache Tools (Hadoop, Hive, Spark) + id: cache-tools + uses: actions/cache@v3 + with: + path: | + notebooks/hadoop-* + notebooks/apache-hive-* + notebooks/spark-* + notebooks/iceberg-*.jar + notebooks/nessie-quarkus-*.jar + key: tools-cache-${{ hashFiles('docker/utils/__init__.py') }} + - name: Check Dockerfile has correct registry + run: | + grep -q 'FROM ghcr.io/projectnessie/nessie-binder-demos:.*' binder/Dockerfile - name: Check Dockerfile hash is up-to-date - if: github.actor != 'dependabot[bot]' + if: github.actor != 'renovate' run: | bash -ex .github/scripts/modify_dockerfile.sh changed_hash=$(git status --porcelain binder/Dockerfile) @@ -63,5 +94,10 @@ jobs: fi echo "PASSED: Dockerfile hash is up-to-date!" 
- name: Test Notebooks with Tox - working-directory: notebooks/tests + working-directory: notebooks/ run: tox + - name: Dump Hive output on error + working-directory: notebooks/ + if: failure() + run: | + cat nohup.out diff --git a/.gitignore b/.gitignore index ef780a61..c8821355 100644 --- a/.gitignore +++ b/.gitignore @@ -25,6 +25,7 @@ notebooks/iceberg-*-runtime-* notebooks/hadoop-* notebooks/apache-hive-*-bin notebooks/metastore_db +notebooks/hiveserver2.pid notebooks/*.log notebooks/*.out # using sed on mac always needs a backup file @@ -38,6 +39,9 @@ venv/ __pycache__/ .pytest_cache +# pyenv +.python-version + # Jetbrains IDEs /.idea *.iws diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index db3f99ce..42a0ed1c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -16,19 +16,19 @@ repos: - repo: https://github.com/psf/black - rev: 21.6b0 + rev: 22.8.0 hooks: - id: black language_version: python3 - repo: https://github.com/asottile/reorder_python_imports - rev: v2.5.0 + rev: v3.12.0 hooks: - id: reorder-python-imports args: - --py3-plus - --application-directories=pynessie:tests:python/pynessie:python:tests - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v4.5.0 hooks: - id: trailing-whitespace - id: check-added-large-files diff --git a/README.md b/README.md index 136a9e19..72d1abaa 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,6 @@ These demos run under binder and can be found at: * [Spark and Iceberg](https://mybinder.org/v2/gh/projectnessie/nessie-demos/main?labpath=notebooks%2Fnessie-iceberg-demo-nba.ipynb) -* [Spark and Delta](https://mybinder.org/v2/gh/projectnessie/nessie-demos/main?labpath=notebooks%2Fnessie-delta-demo-nba.ipynb) * [Flink and Iceberg](https://mybinder.org/v2/gh/projectnessie/nessie-demos/main?labpath=notebooks%2Fnessie-iceberg-flink-demo-nba.ipynb) * [Hive and Iceberg](https://mybinder.org/v2/gh/projectnessie/nessie-demos/main?labpath=notebooks%2Fnessie-iceberg-hive-demo-nba.ipynb) @@ -22,13 +21,7 @@ Nessie version is set in Binder at `docker/binder/requirements_base.txt`. Curren ### Iceberg -Currently we are using Iceberg `0.13.1` and it is specified in both iceberg notebooks as well as `docker/utils/__init__.py` - -### Delta - -Currently, the Delta version is taken directly from the Nessie version and isn't explicitly noted. It is currently `1.1.0-nessie` - -See https://github.com/projectnessie/nessie/blob/nessie-0.30.0/pom.xml#L171 +Currently we are using Iceberg `1.4.2` and it is specified in both iceberg notebooks as well as `docker/utils/__init__.py` ### Spark @@ -37,7 +30,7 @@ Only has to be updated in `docker/binder/requirements.txt`. Currently, Iceberg s ### Flink -Flink version is set in Binder at `docker/binder/requirements_flink.txt`. Currently, we are using `1.13.6`. +Flink version is set in Binder at `docker/binder/requirements_flink.txt`. Currently, we are using `1.17.1`. 
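Since the Iceberg version has to stay in sync between `docker/utils/__init__.py` and the two Iceberg notebooks, a quick consistency check can catch drift when bumping versions. The snippet below is only an illustrative sketch and is not part of this change set: the `_ICEBERG_VERSION` constant and file locations follow the repository layout shown in this diff, while the notebook glob pattern and the regular expression for jar references are assumptions.

```python
# Illustrative sketch: check that the Iceberg version pinned in docker/utils/__init__.py
# matches the iceberg-*-runtime versions referenced by the Iceberg demo notebooks.
import re
from pathlib import Path

pinned = re.search(
    r'_ICEBERG_VERSION = "([^"]+)"', Path("docker/utils/__init__.py").read_text()
).group(1)

for nb in Path("notebooks").glob("nessie-iceberg-*.ipynb"):
    # Assumed notation, e.g. "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.4.2"
    # or "iceberg-flink-runtime-1.17-1.4.2.jar".
    for version in re.findall(r"iceberg-\w+-runtime[^\s\"',]*[:-](\d+\.\d+\.\d+)", nb.read_text()):
        assert version == pinned, f"{nb.name} references Iceberg {version}, expected {pinned}"

print(f"Iceberg notebooks agree with the pinned version {pinned}")
```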
### Hadoop @@ -60,7 +53,7 @@ Of course, Binder just lets a user "simply start" a notebook via a simple "click ## Development For development, you will need to make sure to have the following installed: -- Python 3.7+ +- Python 3.10+ - pre-commit Regarding pre-commit, you will need to make sure is installed through `pre-commit install` in order to install the hooks locally since this repo diff --git a/binder/Dockerfile b/binder/Dockerfile index 53efebc4..b0587230 100644 --- a/binder/Dockerfile +++ b/binder/Dockerfile @@ -2,7 +2,7 @@ # Tag will be automatically generated through pre-commit hook if any changes # happened in the docker/ folder -FROM ghcr.io/projectnessie/nessie-binder-demos:649ec80b8fa7d9666178380a33b2e645a52d5985 +FROM ghcr.io/projectnessie/nessie-binder-demos:dd32c4413d91c22676121f62119bcc7f167e4752 # Create the necessary folders for the demo, this will be created and owned by {NB_USER} RUN mkdir -p notebooks && mkdir -p datasets diff --git a/binder/README.md b/binder/README.md index ba7274a6..2d1b1887 100644 --- a/binder/README.md +++ b/binder/README.md @@ -1,8 +1,8 @@ ## Building binder locally ### Prerequisites -You need to have a python 3.7+ installed. -We recommend to use [pyenv](https://github.com/pyenv/pyenv) for managing your python environment(s). +You need to have a python 3.10+ installed. +We recommend to use [pyenv](https://github.com/pyenv/pyenv) for managing your python environment(s). To build the binder image locally, firstly, you need to install `jupyter-repo2docker` dependency: @@ -29,8 +29,8 @@ Run (or look into) the `build_run_local_docker.sh` script how to do this semi-au After those steps, the binder should be running on your local machine. Next, find the output similar to this: ```shell -[C 13:38:25.199 NotebookApp] - +[C 13:38:25.199 NotebookApp] + To access the notebook, open this file in a browser: file:///home/jovyan/.local/share/jupyter/runtime/nbserver-40-open.html Or copy and paste this URL: diff --git a/docker/binder/apt.txt b/docker/binder/apt.txt index 84c10603..0ed4240f 100644 --- a/docker/binder/apt.txt +++ b/docker/binder/apt.txt @@ -16,9 +16,12 @@ # Packages needed for mybinder.org +openjdk-8-jdk-headless openjdk-11-jdk-headless # SASL lib needed for thrift API to access Hive libsasl2-dev libsasl2-modules # for removal of duplicate files rdfind +# need `netstat` for start scripts +net-tools diff --git a/docker/binder/postBuild b/docker/binder/postBuild index d23de2c6..d7ba9374 100644 --- a/docker/binder/postBuild +++ b/docker/binder/postBuild @@ -26,7 +26,7 @@ python -m ipykernel install --name "flink-demo" --user python -c "import utils;utils._copy_all_hadoop_jars_to_pyflink()" conda deactivate -python -c "import utils;utils.fetch_nessie()" +python -c "import utils;utils.fetch_nessie_jar()" python -c "import utils;utils.fetch_spark()" diff --git a/docker/binder/requirements.txt b/docker/binder/requirements.txt index a95841b6..065e6ff7 100644 --- a/docker/binder/requirements.txt +++ b/docker/binder/requirements.txt @@ -1,5 +1,7 @@ -r requirements_base.txt findspark==2.0.1 -pandas==1.3.5 -pyhive[hive]==0.6.5 -pyspark==3.2.1 +# Need this numpy version due to compatibility reasons with numpy/pyspark +numpy==1.21.6 +pandas==1.5.3 +pyhive[hive_pure_sasl]==0.7.0 +pyspark==3.2.4 diff --git a/docker/binder/requirements_base.txt b/docker/binder/requirements_base.txt index b842f398..303085b3 100644 --- a/docker/binder/requirements_base.txt +++ b/docker/binder/requirements_base.txt @@ -1 +1 @@ -pynessie==0.30.0 +pynessie==0.65.0 diff --git 
a/docker/binder/requirements_flink.txt b/docker/binder/requirements_flink.txt index 3e1775e6..664b2d3f 100644 --- a/docker/binder/requirements_flink.txt +++ b/docker/binder/requirements_flink.txt @@ -1,4 +1,2 @@ -r requirements_base.txt -apache-flink==1.13.6 -# flink requires pandas<1.2.0 see https://github.com/apache/flink/blob/release-1.13.6/flink-python/setup.py#L313 -pandas==1.1.5 +apache-flink==1.17.1 diff --git a/docker/binder/runtime.txt b/docker/binder/runtime.txt new file mode 100644 index 00000000..55090899 --- /dev/null +++ b/docker/binder/runtime.txt @@ -0,0 +1 @@ +python-3.10 diff --git a/docker/binder/start b/docker/binder/start index 18c41941..6531ae7e 100755 --- a/docker/binder/start +++ b/docker/binder/start @@ -15,19 +15,33 @@ # limitations under the License. # -nohup ./nessie-quarkus-runner & - SPARK_VERSION=$(python -c "import utils;print(utils._SPARK_VERSION)") HADOOP_VERSION=$(python -c "import utils;print(utils._HADOOP_VERSION)") HIVE_VERSION=$(python -c "import utils;print(utils._HIVE_VERSION)") -export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64 +export JAVA11_HOME=/usr/lib/jvm/java-11-openjdk-amd64 +export JAVA8_HOME=/usr/lib/jvm/java-8-openjdk-amd64 +export JAVA_HOME=$JAVA11_HOME +export PATH=$JAVA_HOME/bin:$PATH + +nohup java -jar nessie-quarkus-runner.jar > nohup-nessie.out & +# Wait until Nessie is ready to accept requests. +echo "Waiting up to 180 seconds for Nessie to be ready..." +for _ in $(seq 1 180) ; do + (netstat -ant | grep -q ':19120 .* LISTEN') && break + sleep 1 +done +if ! netstat -ant | grep -q ':19120 .* LISTEN' ; then + echo "Nessie did not start / not listening on port 19120!" + exit 1 +fi +echo "Nessie listening on port 19120." export SPARK_HOME=$PWD/spark-$SPARK_VERSION-bin-hadoop3.2 export HADOOP_HOME=$PWD/hadoop-$HADOOP_VERSION #Start Hive chmod +x $PWD/binder/start.hive -nohup $PWD/binder/start.hive $PWD $PWD/binder/resources $HIVE_VERSION +nohup $PWD/binder/start.hive $PWD $PWD/binder/resources $HIVE_VERSION > nohup-hive.out exec "$@" diff --git a/docker/binder/start.hive b/docker/binder/start.hive index c8ddd968..22f0e7e2 100755 --- a/docker/binder/start.hive +++ b/docker/binder/start.hive @@ -20,6 +20,8 @@ RESOURCE_DIR=$2 HIVE_VERSION=$3 HIVE_FOLDER_NAME="apache-hive-$HIVE_VERSION-bin" HIVE_WAREHOUSE_DIR=$HIVE_PARENT_DIR/hive_warehouse +HIVE_PID_FILE=$HIVE_PARENT_DIR/hiveserver2.pid +HIVE_DB=$HIVE_PARENT_DIR/metastore_db if [ -z "$HIVE_PARENT_DIR" ]; then echo "Input the parent dir as the first argument" @@ -38,15 +40,45 @@ fi export HIVE_HOME=$HIVE_PARENT_DIR/$HIVE_FOLDER_NAME -# Create hive warehouse folder -mkdir $HIVE_WAREHOUSE_DIR - # Copy the needed configs to Hive folder cp $RESOURCE_DIR/hive/config/hive-site.xml ${HIVE_HOME}/conf/ # Set Hive warehouse path in the hive-site.xml sed -i.bak "s~HIVE_WAREHOUSE_DIR~$HIVE_WAREHOUSE_DIR~g" ${HIVE_HOME}/conf/hive-site.xml +# Check for Java 8 + 11 for tox (also in /notebooks/tests/scripts/start_hive) +if [[ -z ${JAVA8_HOME} || -z ${JAVA11_HOME} || ! -d ${JAVA8_HOME} || ! -d ${JAVA11_HOME} ]] ; then + cat <<! > /dev/stderr + + +============================================================================================================ +Define the JAVA8_HOME and JAVA11_HOME environment variables to point to Java 8 and Java 11 development kits. +============================================================================================================ + +Need Java 8 for Hive server to work. +Java 11 (not newer!) is required for Spark, but also Nessie. + + +!
+ exit 1 +fi + +# Kill an already running hiveserver +if [[ -f $HIVE_PID_FILE ]] ; then + kill "$(cat $HIVE_PID_FILE)" || true + rm $HIVE_PID_FILE +fi + +# Remove an existing metastore_db +if [[ -d $HIVE_DB ]] ; then + echo "Removing existing $HIVE_DB" + rm -rf $HIVE_DB +fi + +# (Re-)create hive warehouse folder +rm -rf $HIVE_WAREHOUSE_DIR +mkdir -p $HIVE_WAREHOUSE_DIR + # Initialize Hive's Derby database $HIVE_HOME/bin/schematool -dbType derby -initSchema echo "Finished initializing Derby database for Hive." @@ -54,5 +86,38 @@ echo "Finished initializing Derby database for Hive." # increase the Heap memory being used by Hive-MapReduce jobs export HADOOP_HEAPSIZE=1500 +# Use Java 8 for Hive :facepalm: +OLD_PATH="$PATH" +export PATH="$JAVA8_HOME/bin:$PATH" +export JAVA_HOME=$JAVA8_HOME +# Start HiveServer2 in the background and record its PID +nohup $HIVE_HOME/bin/hiveserver2 & +echo $! > $HIVE_PID_FILE +echo "... PID is $(cat $HIVE_PID_FILE)" + +# Wait until Hive is ready to accept requests via Thrift. Hive may take some time to start in CI. +echo "Waiting up to 180 seconds for Hive to be ready..." +for _ in $(seq 1 180) ; do + (netstat -ant | grep -q ':10000 .* LISTEN') && break + sleep 1 +done +if ! netstat -ant | grep -q ':10000 .* LISTEN' ; then + echo "Hive did not start / not listening on port 10000 (Thrift)!" + exit 1 +fi +echo "Hive listening on port 10000 (Thrift)." + +# Reset environment +export JAVA_HOME=$JAVA11_HOME +export PATH=$OLD_PATH diff --git a/docker/utils/__init__.py b/docker/utils/__init__.py index 4434fbd7..2907642f 100644 --- a/docker/utils/__init__.py +++ b/docker/utils/__init__.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- # # Copyright (C) 2020 Dremio # @@ -18,7 +19,6 @@ import os import shutil import site -import stat import sysconfig import tarfile from typing import Optional @@ -36,14 +36,18 @@ _SPARK_FILENAME = None _SPARK_URL = None -_HADOOP_VERSION = "2.10.1" +_NESSIE_VERSION = "0.74.0" + +_HADOOP_VERSION = "2.10.2" _HADOOP_FILENAME = f"hadoop-{_HADOOP_VERSION}" _HADOOP_URL = f"https://archive.apache.org/dist/hadoop/common/hadoop-{_HADOOP_VERSION}/{_HADOOP_FILENAME}.tar.gz" -_FLINK_MAJOR_VERSION = "1.13" +_FLINK_MAJOR_VERSION = "1.17" -_ICEBERG_VERSION = "0.13.1" -_ICEBERG_FLINK_FILENAME = f"iceberg-flink-runtime-{_FLINK_MAJOR_VERSION}-{_ICEBERG_VERSION}.jar" +_ICEBERG_VERSION = "1.4.2" +_ICEBERG_FLINK_FILENAME = ( + f"iceberg-flink-runtime-{_FLINK_MAJOR_VERSION}-{_ICEBERG_VERSION}.jar" +) _ICEBERG_FLINK_URL = f"https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-{_FLINK_MAJOR_VERSION}/{_ICEBERG_VERSION}/{_ICEBERG_FLINK_FILENAME}" _ICEBERG_HIVE_FILENAME = f"iceberg-hive-runtime-{_ICEBERG_VERSION}.jar" _ICEBERG_HIVE_URL = f"https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-hive-runtime/{_ICEBERG_VERSION}/{_ICEBERG_HIVE_FILENAME}" @@ -55,7 +59,9 @@ ) -def _link_file_into_dir(source_file: str, target_dir: str, replace_if_exists=True) -> None: +def _link_file_into_dir( + source_file: str, target_dir: str, replace_if_exists=True +) -> None: assert os.path.isfile(source_file) assert os.path.isdir(target_dir) @@ -75,7 +81,7 @@ def _link_file_into_dir(source_file: str, target_dir: str, replace_if_exists=Tru os.link(source_file, target_file) assert os.path.isfile(target_file), (source_file, target_file) - action = 'replaced' if replaced else 'created' + action = "replaced" if replaced else "created" print(f"Link target was {action}: {target_file} (source: {source_file})") @@ -112,7 +118,9 @@ def _copy_all_hadoop_jars_to_pyflink() -> None: pyflink_lib_dir = _find_pyflink_lib_dir() for _jar_count, jar in
enumerate(_jar_files()): _link_file_into_dir(jar, pyflink_lib_dir) - print(f"Linked {_jar_count} HADOOP jar files into the pyflink lib dir at location {pyflink_lib_dir}") + print( + f"Linked {_jar_count} HADOOP jar files into the pyflink lib dir at location {pyflink_lib_dir}" + ) def _find_pyflink_lib_dir() -> Optional[str]: @@ -139,16 +147,6 @@ def _download_file(filename: str, url: str) -> None: f.write(r.content) -def fetch_nessie() -> str: - """Download nessie executable.""" - runner = "nessie-quarkus-runner" - - url = _get_base_nessie_url() - _download_file(runner, url) - os.chmod(runner, os.stat(runner).st_mode | stat.S_IXUSR) - return runner - - def fetch_nessie_jar() -> str: """Download nessie Jar in order to run the tests in Mac""" runner = "nessie-quarkus-runner.jar" @@ -159,12 +157,8 @@ def fetch_nessie_jar() -> str: def _get_base_nessie_url() -> str: - import pynessie - - version = pynessie.__version__ - return "https://github.com/projectnessie/nessie/releases/download/nessie-{}/nessie-quarkus-{}-runner".format( - version, version + _NESSIE_VERSION, _NESSIE_VERSION ) diff --git a/notebooks/nessie-delta-demo-nba.ipynb b/notebooks/nessie-delta-demo-nba.ipynb deleted file mode 100644 index 6d8f5e2e..00000000 --- a/notebooks/nessie-delta-demo-nba.ipynb +++ /dev/null @@ -1,1666 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Nessie Spark SQL Demo with NBA Dataset\n", - "============================\n", - "This demo showcases how to use Nessie Python API along with Spark3 from Delta Lake\n", - "\n", - "Initialize Pyspark\n", - "----------------------------------------------\n", - "To get started, we will first have to do a few setup steps that give us everything we need\n", - "to get started with Nessie. In case you're interested in the detailed setup steps for Spark, you can check out the [docs](https://projectnessie.org/tools/deltalake/spark/)\n", - "\n", - "The Binder server has downloaded spark and some data for us as well as started a Nessie server in the background. All we have to do is start Spark\n", - "\n", - "The below cell starts a local Spark session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: An illegal reflective access operation has occurred\n", - "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", - "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", - "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", - "WARNING: All illegal access operations will be denied in a future release\n", - "https://storage.googleapis.com/nessie-maven added as a remote repository with the name: repo-1\n", - "Ivy Default Cache set to: /home/jovyan/.ivy2/cache\n", - "The jars for the packages stored in: /home/jovyan/.ivy2/jars\n", - "org.projectnessie#nessie-deltalake added as a dependency\n", - "org.projectnessie#nessie-spark-3.2-extensions added as a dependency\n", - ":: resolving dependencies :: org.apache.spark#spark-submit-parent-2ab7f1e0-01bb-42fd-bb2f-6c1b59cdc6dd;1.0\n", - "\tconfs: [default]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":: loading settings :: url = jar:file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\tfound org.projectnessie#nessie-deltalake;0.30.0 in central\n", - "\tfound org.antlr#antlr4-runtime;4.9.2 in central\n", - "\tfound org.projectnessie#nessie-spark-3.2-extensions;0.30.0 in central\n", - "downloading https://repo1.maven.org/maven2/org/projectnessie/nessie-deltalake/0.30.0/nessie-deltalake-0.30.0.jar ...\n", - "\t[SUCCESSFUL ] org.projectnessie#nessie-deltalake;0.30.0!nessie-deltalake.jar (375ms)\n", - "downloading https://repo1.maven.org/maven2/org/projectnessie/nessie-spark-3.2-extensions/0.30.0/nessie-spark-3.2-extensions-0.30.0.jar ...\n", - "\t[SUCCESSFUL ] org.projectnessie#nessie-spark-3.2-extensions;0.30.0!nessie-spark-3.2-extensions.jar (87ms)\n", - "downloading https://repo1.maven.org/maven2/org/antlr/antlr4-runtime/4.9.2/antlr4-runtime-4.9.2.jar ...\n", - "\t[SUCCESSFUL ] org.antlr#antlr4-runtime;4.9.2!antlr4-runtime.jar (59ms)\n", - ":: resolution report :: resolve 19292ms :: artifacts dl 524ms\n", - "\t:: modules in use:\n", - "\torg.antlr#antlr4-runtime;4.9.2 from central in [default]\n", - "\torg.projectnessie#nessie-deltalake;0.30.0 from central in [default]\n", - "\torg.projectnessie#nessie-spark-3.2-extensions;0.30.0 from central in [default]\n", - "\t---------------------------------------------------------------------\n", - "\t| | modules || artifacts |\n", - "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", - "\t---------------------------------------------------------------------\n", - "\t| default | 3 | 3 | 3 | 0 || 3 | 3 |\n", - "\t---------------------------------------------------------------------\n", - ":: retrieving :: org.apache.spark#spark-submit-parent-2ab7f1e0-01bb-42fd-bb2f-6c1b59cdc6dd\n", - "\tconfs: [default]\n", - "\t3 artifacts copied, 0 already retrieved (4023kB/6ms)\n", - "22/05/24 07:49:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n", - "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", - "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Spark Running\n" - ] - } - ], - "source": [ - "import os\n", - "import findspark\n", - "from pyspark.sql import *\n", - "from pyspark import SparkConf\n", - "import pynessie\n", - "\n", - "findspark.init()\n", - "pynessie_version = pynessie.__version__\n", - "\n", - "warehouse = \"file://\" + os.getcwd() + \"/spark_warehouse/delta\"\n", - "conf = SparkConf()\n", - "# we add our custom fork of delta to the known repositories\n", - "conf.set(\"spark.jars.repositories\", \"https://storage.googleapis.com/nessie-maven\")\n", - "# we need delta libraries and the nessie sql extensions\n", - "conf.set(\n", - " \"spark.jars.packages\",\n", - " f\"org.projectnessie:nessie-deltalake:{pynessie_version},org.projectnessie:nessie-spark-3.2-extensions:{pynessie_version}\",\n", - ")\n", - "# ensure python <-> java interactions are w/ pyarrow\n", - "conf.set(\"spark.sql.execution.pyarrow.enabled\", \"true\")\n", - "# create catalog dev_catalog as a Delta catalog\n", - "conf.set(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n", - "# set the location for Nessie catalog to store data. Spark writes to this directory\n", - "conf.set(\"spark.sql.catalog.spark_catalog.warehouse\", warehouse)\n", - "# set the location of the nessie server. In this demo its running locally. There are many ways to run it (see https://projectnessie.org/try/)\n", - "conf.set(\"spark.sql.catalog.spark_catalog.uri\", \"http://localhost:19120/api/v1\")\n", - "# default branch for Nessie catalog to work on\n", - "conf.set(\"spark.sql.catalog.spark_catalog.ref\", \"main\")\n", - "# use no authorization. Options are NONE AWS BASIC and aws implies running Nessie on a lambda\n", - "conf.set(\"spark.sql.catalog.spark_catalog.auth_type\", \"NONE\")\n", - "# These two lines tell Delta to use Nessie as the internal storage handler thereby enabling Delta/Nessie integraton\n", - "conf.set(\"spark.delta.logFileHandler.class\", \"org.projectnessie.deltalake.NessieLogFileMetaParser\")\n", - "conf.set(\"spark.delta.logStore.class\", \"org.projectnessie.deltalake.NessieLogStore\")\n", - "# enable the extensions for both Nessie and Delta\n", - "conf.set(\n", - " \"spark.sql.extensions\",\n", - " \"io.delta.sql.DeltaSparkSessionExtension,org.projectnessie.spark.extensions.NessieSpark32SessionExtensions\",\n", - ")\n", - "# finally, start up the Spark server\n", - "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", - "print(\"Spark Running\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Solving Data Engineering problems with Nessie\n", - "============================\n", - "\n", - "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n", - "\n", - "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. 
We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set up Nessie branches\n", - "----------------------------\n", - "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", - "\n", - "- Create a new branch named `dev`\n", - "- List all branches\n", - "\n", - "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"CREATE BRANCH dev FROM main\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n", - "\n", - "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...\n", - "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"LIST REFERENCES\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create tables under dev branch\n", - "-------------------------------------\n", - "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", - "\n", - "We create two tables under the `dev` branch:\n", - "- `salaries`\n", - "- `totals_stats`\n", - "\n", - "These tables list the salaries per player per year and their stats per year.\n", - "\n", - "To create the data we:\n", - "\n", - "1. switch our branch context to dev\n", - "2. create the table\n", - "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, - { - "data": { - "text/html": [ - "
[empty DataFrame HTML markup omitted; see the text/plain rendering below]\n
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE dev\")\n", - "\n", - "# Creating `salaries` table\n", - "spark.sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS salaries (Season STRING, Team STRING, Salary STRING, Player STRING)\n", - " USING delta LOCATION '{}/salaries'\"\"\".format(\n", - " warehouse\n", - " )\n", - ")\n", - "\n", - "spark.sql(\n", - " \"\"\"CREATE OR REPLACE TEMPORARY VIEW salaries_table USING csv\n", - " OPTIONS (path \"../datasets/nba/salaries.csv\", header true)\"\"\"\n", - ")\n", - "spark.sql(\"INSERT INTO salaries SELECT * FROM salaries_table\")\n", - "\n", - "# Creating `totals_stats` table\n", - "spark.sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS totals_stats (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING,\n", - " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", - " USING delta LOCATION '{}/totals_stats'\"\"\".format(\n", - " warehouse\n", - " )\n", - ")\n", - "spark.sql(\n", - " \"\"\"CREATE OR REPLACE TEMPORARY VIEW stats_table USING csv\n", - " OPTIONS (path \"../datasets/nba/totals_stats.csv\", header true)\"\"\"\n", - ")\n", - "spark.sql(\"INSERT INTO totals_stats SELECT * FROM stats_table\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we count the rows in our tables to ensure they are the same number as the csv files." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n", - "92\n" - ] - } - ], - "source": [ - "table_count = spark.sql(\"select count(*) from salaries\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "table_count = spark.sql(\"select count(*) from totals_stats\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check generated tables\n", - "----------------------------\n", - "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", - "let's verify that the `main` branch was not altered by our changes.\n", - "\n", - "Note: `SHOW TABLES` does not work on Delta because the Delta Catalog has no concept of references. We have to use the command line instead.\n", - "In this demo we are switching the reference around regularly which means `SHOW TABLES` isn't always reliable. In the situation where\n", - "your Spark job is only using one reference we can safely call `SHOW TABLES`." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And on the `dev` branch we expect to see two tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DELTA_LAKE_TABLE:\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list --ref dev" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can also verify that the `dev` and `main` branches point to different commits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch dev 54622696d1313cfcb012120083d917f65558f0906f73ab...\n", - "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"LIST REFERENCES\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dev promotion into main\n", - "-----------------------\n", - "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", - "We merge `dev` into `main` via the Spark sql `merge` command.\n", - "Both branches should be at the same revision after merging/promotion." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " name hash\n", - "0 main dee6e3ec017cd39a272ccd4599bc2f1d4679731bd7b921..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"MERGE BRANCH dev\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can verify that the `main` branch now contains the expected tables and row counts.\n", - "\n", - "The tables are now on `main` and ready for consumption by our blog authors and analysts!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch main dee6e3ec017cd39a272ccd4599bc2f1d4679731bd7b921...\n", - "1 Branch dev 54622696d1313cfcb012120083d917f65558f0906f73ab..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"LIST REFERENCES\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DELTA_LAKE_TABLE:\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n", - "92\n" - ] - } - ], - "source": [ - "spark.sql(\"USE REFERENCE main\")\n", - "table_count = spark.sql(\"select count(*) from salaries\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "table_count = spark.sql(\"select count(*) from totals_stats\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Perform regular ETL on the new tables\n", - "-------------------\n", - "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", - "\n", - "1. Update the salaries table to add new data\n", - "2. We add `Years` column to `totals_stats` table to show how many years the player was in the league\n", - "3. We create a new table to hold information about the players appearances in all star games\n", - "\n", - "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch etl dee6e3ec017cd39a272ccd4599bc2f1d4679731bd7b921..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"CREATE BRANCH etl FROM main\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
[empty DataFrame HTML markup omitted; see the text/plain rendering below]\n
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add some salaries for Kevin Durant\n", - "spark.sql(\"USE REFERENCE etl\")\n", - "spark.sql(\n", - " \"\"\"INSERT INTO salaries VALUES\n", - " (\"2017-18\", \"Golden State Warriors\", \"$25000000\", \"Kevin Durant\"),\n", - " (\"2018-19\", \"Golden State Warriors\", \"$30000000\", \"Kevin Durant\"),\n", - " (\"2019-20\", \"Brooklyn Nets\", \"$37199000\", \"Kevin Durant\"),\n", - " (\"2020-21\", \"Brooklyn Nets\", \"$39058950\", \"Kevin Durant\")\n", - " \"\"\"\n", - ").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[empty DataFrame HTML markup omitted; see the text/plain rendering below]\n
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Adding a column in the `totals_stats` table\n", - "spark.sql(\"ALTER TABLE totals_stats ADD COLUMNS (Years STRING)\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " count(1)\n", - "0 47" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Creating `allstar_games_stats` table and viewing the contents\n", - "spark.sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS allstar_games_stats (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING,\n", - " TOV STRING, PF STRING, PTS STRING, Player STRING)\n", - " USING delta LOCATION '{}/allstar_stats'\"\"\".format(\n", - " warehouse\n", - " )\n", - ")\n", - "spark.sql(\n", - " \"\"\"CREATE OR REPLACE TEMPORARY VIEW allstar_table USING csv\n", - " OPTIONS (path \"../datasets/nba/allstar_games_stats.csv\", header true)\"\"\"\n", - ")\n", - "spark.sql(\"INSERT INTO allstar_games_stats SELECT * FROM allstar_table\").toPandas()\n", - "\n", - "# notice how we view the data on the etl branch via @etl\n", - "spark.sql(\"select count(*) from allstar_games_stats\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can verify that the new table isn't on the `main` branch but is present on the etl branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DELTA_LAKE_TABLE:\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DELTA_LAKE_TABLE:\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.allstar_stats._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list --ref etl" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we are happy with the data we can again merge it into `main`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " name hash\n", - "0 main cdf244cd4af77968becc0ebd0439efea6c6e6df8923ed9..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"MERGE BRANCH etl\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now lets verify that the changes exist on the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DELTA_LAKE_TABLE:\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.allstar_stats._delta_log\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch main cdf244cd4af77968becc0ebd0439efea6c6e6df8923ed9...\n", - "1 Branch etl 2d11823828ee539d7609e1a88083ada6f37d39362a4e3a...\n", - "2 Branch dev 54622696d1313cfcb012120083d917f65558f0906f73ab..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"LIST REFERENCES\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "47\n" - ] - } - ], - "source": [ - "spark.sql(\"USE REFERENCE main\").toPandas()\n", - "table_count = spark.sql(\"select count(*) from allstar_games_stats\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from allstar_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create `experiment` branch\n", - "--------------------------------\n", - "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", - "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", - "and carry out our experiment, which could consist of the following steps:\n", - "- drop `totals_stats` table\n", - "- add data to `salaries` table\n", - "- compare `experiment` and `main` tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch experiment cdf244cd4af77968becc0ebd0439efea6c6e6df8923ed9..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"CREATE BRANCH experiment FROM main\").toPandas()\n", - "spark.sql(\"USE REFERENCE experiment\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[empty DataFrame HTML markup omitted; see the text/plain rendering below]\n
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Drop the `totals_stats` table on the `experiment` branch\n", - "spark.sql(\"DROP TABLE totals_stats\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[empty DataFrame HTML markup omitted; see the text/plain rendering below]\n
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add some salaries for Dirk Nowitzki\n", - "spark.sql(\n", - " \"\"\"INSERT INTO salaries VALUES\n", - " (\"2015-16\", \"Dallas Mavericks\", \"$8333333\", \"Dirk Nowitzki\"),\n", - " (\"2016-17\", \"Dallas Mavericks\", \"$25000000\", \"Dirk Nowitzki\"),\n", - " (\"2017-28\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\"),\n", - " (\"2018-19\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\")\n", - " \"\"\"\n", - ").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DELTA_LAKE_TABLE:\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.allstar_stats._delta_log\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list --ref experiment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "DELTA_LAKE_TABLE:\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n", - "\thome.jovyan.notebooks.spark_warehouse.delta.allstar_stats._delta_log\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n", - "Notice the use of the `nessie` catalog and the use of `@experiment` to view data on the `experiment` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " count(1)\n", - "0 58" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"select count(*) from salaries\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n", - "to the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
[DataFrame HTML table markup omitted; same data as the text/plain rendering below]\n
" - ], - "text/plain": [ - " count(1)\n", - "0 54" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE main\").toPandas()\n", - "spark.sql(\"select count(*) from salaries\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "And finally lets clean up after ourselves" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[status: string]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"DROP BRANCH dev\")\n", - "spark.sql(\"DROP BRANCH etl\")\n", - "spark.sql(\"DROP BRANCH experiment\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file diff --git a/notebooks/nessie-iceberg-demo-nba.ipynb b/notebooks/nessie-iceberg-demo-nba.ipynb index 7b2632a0..051ddec8 100644 --- a/notebooks/nessie-iceberg-demo-nba.ipynb +++ b/notebooks/nessie-iceberg-demo-nba.ipynb @@ -1,1996 +1,668 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Nessie Spark SQL Demo with NBA Dataset\n", - "============================\n", - "This demo showcases how to use Nessie Python API along with Spark3 from Iceberg\n", - "\n", - "Initialize Pyspark\n", - "----------------------------------------------\n", - "To get started, we will first have to do a few setup steps that give us everything we need\n", - "to get started with Nessie. In case you're interested in the detailed setup steps for Spark, you can check out the [docs](https://projectnessie.org/tools/iceberg/spark/).\n", - "\n", - "The Binder server has downloaded spark and some data for us as well as started a Nessie server in the background. All we have to do is start Spark.\n", - "\n", - "The below cell starts a local Spark session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: An illegal reflective access operation has occurred\n", - "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", - "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", - "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", - "WARNING: All illegal access operations will be denied in a future release\n", - "Ivy Default Cache set to: /home/jovyan/.ivy2/cache\n", - "The jars for the packages stored in: /home/jovyan/.ivy2/jars\n", - "org.apache.iceberg#iceberg-spark-runtime-3.2_2.12 added as a dependency\n", - "org.projectnessie#nessie-spark-3.2-extensions added as a dependency\n", - ":: resolving dependencies :: org.apache.spark#spark-submit-parent-6cba98e4-6e15-458e-a366-568683d289f7;1.0\n", - "\tconfs: [default]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":: loading settings :: url = jar:file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\tfound org.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.13.1 in central\n", - "\tfound org.projectnessie#nessie-spark-3.2-extensions;0.30.0 in central\n", - "downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/0.13.1/iceberg-spark-runtime-3.2_2.12-0.13.1.jar ...\n", - "\t[SUCCESSFUL ] org.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.13.1!iceberg-spark-runtime-3.2_2.12.jar (1331ms)\n", - "downloading https://repo1.maven.org/maven2/org/projectnessie/nessie-spark-3.2-extensions/0.30.0/nessie-spark-3.2-extensions-0.30.0.jar ...\n", - "\t[SUCCESSFUL ] org.projectnessie#nessie-spark-3.2-extensions;0.30.0!nessie-spark-3.2-extensions.jar (70ms)\n", - ":: resolution report :: resolve 13309ms :: artifacts dl 1405ms\n", - "\t:: modules in use:\n", - "\torg.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.13.1 from central in [default]\n", - "\torg.projectnessie#nessie-spark-3.2-extensions;0.30.0 from central in [default]\n", - "\t---------------------------------------------------------------------\n", - "\t| | modules || artifacts |\n", - "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", - "\t---------------------------------------------------------------------\n", - "\t| default | 2 | 2 | 2 | 0 || 2 | 2 |\n", - "\t---------------------------------------------------------------------\n", - ":: retrieving :: org.apache.spark#spark-submit-parent-6cba98e4-6e15-458e-a366-568683d289f7\n", - "\tconfs: [default]\n", - "\t2 artifacts copied, 0 already retrieved (22360kB/20ms)\n", - "22/05/24 07:43:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", - "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", - "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Spark Running\n" - ] - } - ], - "source": [ - "import os\n", - "import findspark\n", - "from pyspark.sql import *\n", - "from pyspark import SparkConf\n", - "import pynessie\n", - "\n", - "findspark.init()\n", - "pynessie_version = pynessie.__version__\n", - "\n", - "conf = SparkConf()\n", - "# we need iceberg libraries and the nessie sql extensions\n", - "conf.set(\n", - " \"spark.jars.packages\",\n", - " f\"org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.1,org.projectnessie:nessie-spark-3.2-extensions:{pynessie_version}\",\n", - ")\n", - "# ensure python <-> java interactions are w/ pyarrow\n", - "conf.set(\"spark.sql.execution.pyarrow.enabled\", \"true\")\n", - "# create catalog dev_catalog as an iceberg catalog\n", - "conf.set(\"spark.sql.catalog.dev_catalog\", \"org.apache.iceberg.spark.SparkCatalog\")\n", - "# tell the dev_catalog that its a Nessie catalog\n", - "conf.set(\"spark.sql.catalog.dev_catalog.catalog-impl\", \"org.apache.iceberg.nessie.NessieCatalog\")\n", - "# set the location for Nessie catalog to store data. Spark writes to this directory\n", - "conf.set(\"spark.sql.catalog.dev_catalog.warehouse\", \"file://\" + os.getcwd() + \"/spark_warehouse/iceberg\")\n", - "# set the location of the nessie server. In this demo its running locally. There are many ways to run it (see https://projectnessie.org/try/)\n", - "conf.set(\"spark.sql.catalog.dev_catalog.uri\", \"http://localhost:19120/api/v1\")\n", - "# default branch for Nessie catalog to work on\n", - "conf.set(\"spark.sql.catalog.dev_catalog.ref\", \"main\")\n", - "# use no authorization. Options are NONE AWS BASIC and aws implies running Nessie on a lambda\n", - "conf.set(\"spark.sql.catalog.dev_catalog.auth_type\", \"NONE\")\n", - "# enable the extensions for both Nessie and Iceberg\n", - "conf.set(\n", - " \"spark.sql.extensions\",\n", - " \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSpark32SessionExtensions\",\n", - ")\n", - "# finally, start up the Spark server\n", - "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", - "print(\"Spark Running\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Solving Data Engineering problems with Nessie\n", - "============================\n", - "\n", - "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n", - "\n", - "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set up Nessie branches\n", - "----------------------------\n", - "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", - "\n", - "- Create a new branch named `dev`\n", - "- List all branches\n", - "\n", - "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchdev2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"CREATE BRANCH dev IN dev_catalog FROM main\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n", - "\n", - "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchdev2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...
1Branchmain2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...\n", - "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create tables under dev branch\n", - "-------------------------------------\n", - "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", - "\n", - "We create two tables under the `dev` branch:\n", - "- `salaries`\n", - "- `totals_stats`\n", - "\n", - "These tables list the salaries per player per year and their stats per year.\n", - "\n", - "To create the data we:\n", - "\n", - "1. switch our branch context to dev\n", - "2. create the table\n", - "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE dev IN dev_catalog\")\n", - "\n", - "# Creating `salaries` table\n", - "spark.sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.salaries\n", - " (Season STRING, Team STRING, Salary STRING, Player STRING) USING iceberg\"\"\"\n", - ")\n", - "\n", - "spark.sql(\n", - " \"\"\"CREATE OR REPLACE TEMPORARY VIEW salaries_table USING csv\n", - " OPTIONS (path \"../datasets/nba/salaries.csv\", header true)\"\"\"\n", - ")\n", - "spark.sql(\"INSERT INTO dev_catalog.nba.salaries SELECT * FROM salaries_table\")\n", - "\n", - "# Creating `totals_stats` table\n", - "spark.sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.totals_stats (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING,\n", - " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", - " USING iceberg\"\"\"\n", - ")\n", - "spark.sql(\n", - " \"\"\"CREATE OR REPLACE TEMPORARY VIEW stats_table USING csv\n", - " OPTIONS (path \"../datasets/nba/totals_stats.csv\", header true)\"\"\"\n", - ")\n", - "spark.sql(\"INSERT INTO dev_catalog.nba.totals_stats SELECT * FROM stats_table\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we count the rows in our tables to ensure they are the same number as the csv files. Note we use the `table@branch` notation which overrides the context set by a `USE REFERENCE` command." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n", - "92\n" - ] - } - ], - "source": [ - "table_count = spark.sql(\"select count(*) from dev_catalog.nba.`salaries@dev`\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "table_count = spark.sql(\"select count(*) from dev_catalog.nba.`totals_stats@dev`\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check generated tables\n", - "----------------------------\n", - "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", - "let's verify that the `main` branch was not altered by our changes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [namespace, tableName, isTemporary]\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", - "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And on the `dev` branch we expect to see two tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbatotals_statsFalse
1nbasalariesFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba totals_stats False\n", - "1 nba salaries False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE dev IN dev_catalog\").toPandas()\n", - "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can also verify that the `dev` and `main` branches point to different commits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchdev70a8df769b477de5b9157691edef1efca8a640ae9f7137...
1Branchmain2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch dev 70a8df769b477de5b9157691edef1efca8a640ae9f7137...\n", - "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dev promotion into main\n", - "-----------------------\n", - "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", - "We merge `dev` into `main` via the Spark sql `merge` command.\n", - "Both branches should be at the same revision after merging/promotion." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namehash
0mainaf5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38...
\n", - "
" - ], - "text/plain": [ - " name hash\n", - "0 main af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"MERGE BRANCH dev INTO main IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can verify that the `main` branch now contains the expected tables and row counts.\n", - "\n", - "The tables are now on `main` and ready for consumption by our blog authors and analysts!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchmainaf5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38...
1Branchdev70a8df769b477de5b9157691edef1efca8a640ae9f7137...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch main af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38...\n", - "1 Branch dev 70a8df769b477de5b9157691edef1efca8a640ae9f7137..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbatotals_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba totals_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", - "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n", - "92\n" - ] - } - ], - "source": [ - "table_count = spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "table_count = spark.sql(\"select count(*) from dev_catalog.nba.totals_stats\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Perform regular ETL on the new tables\n", - "-------------------\n", - "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", - "\n", - "1. Update the salaries table to add new data\n", - "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n", - "3. We create a new table to hold information about the players appearances in all star games\n", - "\n", - "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchetlaf5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch etl af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"CREATE BRANCH etl IN dev_catalog FROM main\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add some salaries for Kevin Durant\n", - "spark.sql(\"USE REFERENCE etl IN dev_catalog\")\n", - "spark.sql(\n", - " \"\"\"INSERT INTO dev_catalog.nba.salaries VALUES\n", - " (\"2017-18\", \"Golden State Warriors\", \"$25000000\", \"Kevin Durant\"),\n", - " (\"2018-19\", \"Golden State Warriors\", \"$30000000\", \"Kevin Durant\"),\n", - " (\"2019-20\", \"Brooklyn Nets\", \"$37199000\", \"Kevin Durant\"),\n", - " (\"2020-21\", \"Brooklyn Nets\", \"$39058950\", \"Kevin Durant\")\n", - " \"\"\"\n", - ").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Dropping a column in the `totals_stats` table\n", - "spark.sql(\"ALTER TABLE dev_catalog.nba.totals_stats DROP COLUMN Age\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
count(1)
047
\n", - "
" - ], - "text/plain": [ - " count(1)\n", - "0 47" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Creating `allstar_games_stats` table and viewing the contents\n", - "spark.sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.allstar_games_stats (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING,\n", - " TOV STRING, PF STRING, PTS STRING, Player STRING)\n", - " USING iceberg\"\"\"\n", - ")\n", - "spark.sql(\n", - " \"\"\"CREATE OR REPLACE TEMPORARY VIEW allstar_table USING csv\n", - " OPTIONS (path \"../datasets/nba/allstar_games_stats.csv\", header true)\"\"\"\n", - ")\n", - "spark.sql(\"INSERT INTO dev_catalog.nba.allstar_games_stats SELECT * FROM allstar_table\").toPandas()\n", - "\n", - "# notice how we view the data on the etl branch via @etl\n", - "spark.sql(\"select count(*) from dev_catalog.nba.`allstar_games_stats@etl`\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can verify that the new table isn't on the `main` branch but is present on the etl branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbatotals_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba totals_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", - "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbaallstar_games_statsFalse
1nbatotals_statsFalse
2nbasalariesFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba allstar_games_stats False\n", - "1 nba totals_stats False\n", - "2 nba salaries False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE etl IN dev_catalog\").toPandas()\n", - "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we are happy with the data we can again merge it into `main`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namehash
0maine0f4167c8947d15161a8fedc376da020f1ecff63c5eea2...
\n", - "
" - ], - "text/plain": [ - " name hash\n", - "0 main e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"MERGE BRANCH etl INTO main IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now lets verify that the changes exist on the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbaallstar_games_statsFalse
2nbatotals_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba allstar_games_stats False\n", - "2 nba totals_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", - "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchmaine0f4167c8947d15161a8fedc376da020f1ecff63c5eea2...
1Branchetl957c1254ab0a3e3bd1e306669ebe7073e27a97966bcfda...
2Branchdev70a8df769b477de5b9157691edef1efca8a640ae9f7137...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch main e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2...\n", - "1 Branch etl 957c1254ab0a3e3bd1e306669ebe7073e27a97966bcfda...\n", - "2 Branch dev 70a8df769b477de5b9157691edef1efca8a640ae9f7137..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "47\n" - ] - } - ], - "source": [ - "table_count = spark.sql(\"select count(*) from dev_catalog.nba.allstar_games_stats\").toPandas().values[0][0]\n", - "csv_count = spark.sql(\"select count(*) from allstar_table\").toPandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create `experiment` branch\n", - "--------------------------------\n", - "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", - "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", - "and carry out our experiment, which could consist of the following steps:\n", - "- drop `totals_stats` table\n", - "- add data to `salaries` table\n", - "- compare `experiment` and `main` tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchexperimente0f4167c8947d15161a8fedc376da020f1ecff63c5eea2...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch experiment e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"CREATE BRANCH experiment IN dev_catalog FROM main\").toPandas()\n", - "spark.sql(\"USE REFERENCE experiment IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Drop the `totals_stats` table on the `experiment` branch\n", - "spark.sql(\"DROP TABLE dev_catalog.nba.totals_stats\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# add some salaries for Dirk Nowitzki\n", - "spark.sql(\n", - " \"\"\"INSERT INTO dev_catalog.nba.salaries VALUES\n", - " (\"2015-16\", \"Dallas Mavericks\", \"$8333333\", \"Dirk Nowitzki\"),\n", - " (\"2016-17\", \"Dallas Mavericks\", \"$25000000\", \"Dirk Nowitzki\"),\n", - " (\"2017-28\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\"),\n", - " (\"2018-19\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\")\n", - " \"\"\"\n", - ").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbaallstar_games_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba allstar_games_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbaallstar_games_statsFalse
2nbatotals_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba allstar_games_stats False\n", - "2 nba totals_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", - "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n", - "Notice the use of the `nessie` catalog and the use of `@experiment` to view data on the `experiment` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
count(1)
058
\n", - "
" - ], - "text/plain": [ - " count(1)\n", - "0 58" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"select count(*) from dev_catalog.nba.`salaries@experiment`\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n", - "to the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
count(1)
054
\n", - "
" - ], - "text/plain": [ - " count(1)\n", - "0 54" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "And finally lets clean up after ourselves" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[status: string]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "spark.sql(\"DROP BRANCH dev IN dev_catalog\")\n", - "spark.sql(\"DROP BRANCH etl IN dev_catalog\")\n", - "spark.sql(\"DROP BRANCH experiment IN dev_catalog\")" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nessie Spark SQL Demo with NBA Dataset\n", + "============================\n", + "This demo showcases how to use Nessie Python API along with Spark3 from Iceberg\n", + "\n", + "Initialize Pyspark\n", + "----------------------------------------------\n", + "To get started, we will first have to do a few setup steps that give us everything we need\n", + "to get started with Nessie. In case you're interested in the detailed setup steps for Spark, you can check out the [docs](https://projectnessie.org/tools/iceberg/spark/).\n", + "\n", + "The Binder server has downloaded spark and some data for us as well as started a Nessie server in the background. All we have to do is start Spark.\n", + "\n", + "The below cell starts a local Spark session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import os\n", + "import findspark\n", + "from pyspark.sql import *\n", + "from pyspark import SparkConf\n", + "import pynessie\n", + "\n", + "findspark.init()\n", + "pynessie_version = pynessie.__version__\n", + "\n", + "conf = SparkConf()\n", + "# we need iceberg libraries and the nessie sql extensions\n", + "conf.set(\n", + " \"spark.jars.packages\",\n", + " f\"org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.4.2,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.2_2.12:0.74.0\",\n", + ")\n", + "# ensure python <-> java interactions are w/ pyarrow\n", + "conf.set(\"spark.sql.execution.pyarrow.enabled\", \"true\")\n", + "# create catalog dev_catalog as an iceberg catalog\n", + "conf.set(\"spark.sql.catalog.dev_catalog\", \"org.apache.iceberg.spark.SparkCatalog\")\n", + "# tell the dev_catalog that its a Nessie catalog\n", + "conf.set(\"spark.sql.catalog.dev_catalog.catalog-impl\", \"org.apache.iceberg.nessie.NessieCatalog\")\n", + "# set the location for Nessie catalog to store data. Spark writes to this directory\n", + "conf.set(\"spark.sql.catalog.dev_catalog.warehouse\", \"file://\" + os.getcwd() + \"/spark_warehouse/iceberg\")\n", + "# set the location of the nessie server. In this demo its running locally. There are many ways to run it (see https://projectnessie.org/try/)\n", + "conf.set(\"spark.sql.catalog.dev_catalog.uri\", \"http://localhost:19120/api/v1\")\n", + "# default branch for Nessie catalog to work on\n", + "conf.set(\"spark.sql.catalog.dev_catalog.ref\", \"main\")\n", + "# use no authorization. 
Options are NONE AWS BASIC and aws implies running Nessie on a lambda\n", + "conf.set(\"spark.sql.catalog.dev_catalog.auth_type\", \"NONE\")\n", + "# enable the extensions for both Nessie and Iceberg\n", + "conf.set(\n", + " \"spark.sql.extensions\",\n", + " \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions\",\n", + ")\n", + "# finally, start up the Spark server\n", + "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", + "print(\"Spark Running\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Solving Data Engineering problems with Nessie\n", + "============================\n", + "\n", + "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n", + "\n", + "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up Nessie branches\n", + "----------------------------\n", + "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", + "\n", + "- Create a new branch named `dev`\n", + "- List all branches\n", + "\n", + "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create the 'nba' namespace in Nessie\n", + "spark.sql(\"CREATE NAMESPACE dev_catalog.nba\")\n", + "\n", + "# Create the 'dev' branch from 'main' branch\n", + "spark.sql(\"CREATE BRANCH dev IN dev_catalog FROM main\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n", + "\n", + "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create tables under dev branch\n", + "-------------------------------------\n", + "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", + "\n", + "We create two tables under the `dev` branch:\n", + "- `salaries`\n", + "- `totals_stats`\n", + "\n", + "These tables list the salaries per player per year and their stats per year.\n", + "\n", + "To create the data we:\n", + "\n", + "1. switch our branch context to dev\n", + "2. create the table\n", + "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. 
A production use case would likely take feeds from official data sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"USE REFERENCE dev IN dev_catalog\")\n", + "\n", + "# Creating `salaries` table\n", + "spark.sql(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.salaries\n", + " (Season STRING, Team STRING, Salary STRING, Player STRING) USING iceberg\"\"\"\n", + ")\n", + "\n", + "spark.sql(\n", + " \"\"\"CREATE OR REPLACE TEMPORARY VIEW salaries_table USING csv\n", + " OPTIONS (path \"../datasets/nba/salaries.csv\", header true)\"\"\"\n", + ")\n", + "spark.sql(\"INSERT INTO dev_catalog.nba.salaries SELECT * FROM salaries_table\")\n", + "\n", + "# Creating `totals_stats` table\n", + "spark.sql(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.totals_stats (\n", + " Season STRING, Age STRING, Team STRING, ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING,\n", + " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", + " USING iceberg\"\"\"\n", + ")\n", + "spark.sql(\n", + " \"\"\"CREATE OR REPLACE TEMPORARY VIEW stats_table USING csv\n", + " OPTIONS (path \"../datasets/nba/totals_stats.csv\", header true)\"\"\"\n", + ")\n", + "spark.sql(\"INSERT INTO dev_catalog.nba.totals_stats SELECT * FROM stats_table\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we count the rows in our tables to ensure they are the same number as the csv files. Note we use the `table@branch` notation which overrides the context set by a `USE REFERENCE` command." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "table_count = spark.sql(\"select count(*) from dev_catalog.nba.`salaries@dev`\").toPandas().values[0][0]\n", + "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n", + "assert table_count == csv_count\n", + "print(table_count)\n", + "\n", + "table_count = spark.sql(\"select count(*) from dev_catalog.nba.`totals_stats@dev`\").toPandas().values[0][0]\n", + "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check generated tables\n", + "----------------------------\n", + "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", + "let's verify that the `main` branch was not altered by our changes." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", + "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And on the `dev` branch we expect to see two tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"USE REFERENCE dev IN dev_catalog\").toPandas()\n", + "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can also verify that the `dev` and `main` branches point to different commits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dev promotion into main\n", + "-----------------------\n", + "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", + "We merge `dev` into `main` via the Spark sql `merge` command.\n", + "Both branches should be at the same revision after merging/promotion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"MERGE BRANCH dev INTO main IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can verify that the `main` branch now contains the expected tables and row counts.\n", + "\n", + "The tables are now on `main` and ready for consumption by our blog authors and analysts!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", + "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table_count = spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas().values[0][0]\n", + "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n", + "assert table_count == csv_count\n", + "print(table_count)\n", + "\n", + "table_count = spark.sql(\"select count(*) from dev_catalog.nba.totals_stats\").toPandas().values[0][0]\n", + "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Perform regular ETL on the new tables\n", + "-------------------\n", + "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", + "\n", + "1. Update the salaries table to add new data\n", + "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n", + "3. 
We create a new table to hold information about the players appearances in all star games\n", + "\n", + "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"CREATE BRANCH etl IN dev_catalog FROM main\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" + }, + "outputs": [], + "source": [ + "# add some salaries for Kevin Durant\n", + "spark.sql(\"USE REFERENCE etl IN dev_catalog\")\n", + "spark.sql(\n", + " \"\"\"INSERT INTO dev_catalog.nba.salaries VALUES\n", + " (\"2017-18\", \"Golden State Warriors\", \"$25000000\", \"Kevin Durant\"),\n", + " (\"2018-19\", \"Golden State Warriors\", \"$30000000\", \"Kevin Durant\"),\n", + " (\"2019-20\", \"Brooklyn Nets\", \"$37199000\", \"Kevin Durant\"),\n", + " (\"2020-21\", \"Brooklyn Nets\", \"$39058950\", \"Kevin Durant\")\n", + " \"\"\"\n", + ").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dropping a column in the `totals_stats` table\n", + "spark.sql(\"ALTER TABLE dev_catalog.nba.totals_stats DROP COLUMN Age\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating `allstar_games_stats` table and viewing the contents\n", + "spark.sql(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.allstar_games_stats (\n", + " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING,\n", + " TOV STRING, PF STRING, PTS STRING, Player STRING)\n", + " USING iceberg\"\"\"\n", + ")\n", + "spark.sql(\n", + " \"\"\"CREATE OR REPLACE TEMPORARY VIEW allstar_table USING csv\n", + " OPTIONS (path \"../datasets/nba/allstar_games_stats.csv\", header true)\"\"\"\n", + ")\n", + "spark.sql(\"INSERT INTO dev_catalog.nba.allstar_games_stats SELECT * FROM allstar_table\").toPandas()\n", + "\n", + "# notice how we view the data on the etl branch via @etl\n", + "spark.sql(\"select count(*) from dev_catalog.nba.`allstar_games_stats@etl`\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can verify that the new table isn't on the `main` branch but is present on the etl branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", + "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"USE REFERENCE etl IN dev_catalog\").toPandas()\n", + "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we are happy with the data we can again merge it into `main`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": 
{ + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "spark.sql(\"MERGE BRANCH etl INTO main IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now lets verify that the changes exist on the `main` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", + "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table_count = spark.sql(\"select count(*) from dev_catalog.nba.allstar_games_stats\").toPandas().values[0][0]\n", + "csv_count = spark.sql(\"select count(*) from allstar_table\").toPandas().values[0][0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create `experiment` branch\n", + "--------------------------------\n", + "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", + "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", + "and carry out our experiment, which could consist of the following steps:\n", + "- drop `totals_stats` table\n", + "- add data to `salaries` table\n", + "- compare `experiment` and `main` tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"CREATE BRANCH experiment IN dev_catalog FROM main\").toPandas()\n", + "spark.sql(\"USE REFERENCE experiment IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the `totals_stats` table on the `experiment` branch\n", + "spark.sql(\"DROP TABLE dev_catalog.nba.totals_stats\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add some salaries for Dirk Nowitzki\n", + "spark.sql(\n", + " \"\"\"INSERT INTO dev_catalog.nba.salaries VALUES\n", + " (\"2015-16\", \"Dallas Mavericks\", \"$8333333\", \"Dirk Nowitzki\"),\n", + " (\"2016-17\", \"Dallas Mavericks\", \"$25000000\", \"Dirk Nowitzki\"),\n", + " (\"2017-28\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\"),\n", + " (\"2018-19\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\")\n", + " \"\"\"\n", + ").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", + "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n", + "Notice the use of the `nessie` catalog and the use of `@experiment` to view data on the `experiment` branch" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "spark.sql(\"select count(*) from dev_catalog.nba.`salaries@experiment`\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n", + "to the `main` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "And finally lets clean up after ourselves" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" } + }, + "outputs": [], + "source": [ + "spark.sql(\"DROP BRANCH dev IN dev_catalog\")\n", + "spark.sql(\"DROP BRANCH etl IN dev_catalog\")\n", + "spark.sql(\"DROP BRANCH experiment IN dev_catalog\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/nessie-iceberg-flink-demo-nba.ipynb b/notebooks/nessie-iceberg-flink-demo-nba.ipynb index 16646608..c94d5660 100644 --- a/notebooks/nessie-iceberg-flink-demo-nba.ipynb +++ b/notebooks/nessie-iceberg-flink-demo-nba.ipynb @@ -1,2014 +1,763 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Nessie Iceberg/Flink SQL Demo with NBA Dataset\n", - "============================\n", - "This demo showcases how to use Nessie Python API along with Flink from Iceberg\n", - "\n", - "Initialize PyFlink\n", - "----------------------------------------------\n", - "To get started, we will first have to do a few setup steps that give us everything we need\n", - "to get started with Nessie. In case you're interested in the detailed setup steps for Flink, you can check out the [docs](https://projectnessie.org/tools/iceberg/flink/)\n", - "\n", - "The Binder server has downloaded flink and some data for us as well as started a Nessie server in the background. All we have to do is start Flink\n", - "\n", - "The below cell starts a local Flink session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "SLF4J: Class path contains multiple SLF4J bindings.\n", - "SLF4J: Found binding in [jar:file:/srv/conda/envs/flink-demo/lib/python3.7/site-packages/pyflink/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n", - "SLF4J: Found binding in [jar:file:/srv/conda/envs/flink-demo/lib/python3.7/site-packages/pyflink/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n", - "SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n", - "SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "Flink running\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "import os\n", - "from pyflink.datastream import StreamExecutionEnvironment\n", - "from pyflink.table import StreamTableEnvironment\n", - "from pyflink.table.expressions import lit\n", - "from pynessie import init\n", - "\n", - "# where we will store our data\n", - "warehouse = os.path.join(os.getcwd(), \"flink-warehouse\")\n", - "# this was downloaded when Binder started, its available on maven central\n", - "iceberg_flink_runtime_jar = os.path.join(os.getcwd(), \"../iceberg-flink-runtime-1.13-0.13.1.jar\")\n", - "assert os.path.exists(iceberg_flink_runtime_jar)\n", - "\n", - "env = StreamExecutionEnvironment.get_execution_environment()\n", - "env.add_jars(\"file://{}\".format(iceberg_flink_runtime_jar))\n", - "table_env = StreamTableEnvironment.create(env)\n", - "\n", - "nessie_client = init()\n", - "\n", - "\n", - "def create_ref_catalog(ref):\n", - " \"\"\"\n", - " Create a flink catalog that is tied to a specific ref.\n", - "\n", - " In order to create the catalog we have to first create the branch\n", - " \"\"\"\n", - " default_branch = nessie_client.get_default_branch()\n", - " if ref != default_branch:\n", - " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n", - " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n", - " # The important args below are:\n", - " # type - tell Flink to use Iceberg as the catalog\n", - " # catalog-impl - which Iceberg catalog to use, in this case we want Nessie\n", - " # uri - the location of the nessie server.\n", - " # ref - the Nessie ref/branch we want to use (defaults to main)\n", - " # warehouse - the location this catalog should store its data\n", - " table_env.execute_sql(\n", - " f\"\"\"CREATE CATALOG {ref}_catalog WITH (\n", - " 'type'='iceberg',\n", - " 'catalog-impl'='org.apache.iceberg.nessie.NessieCatalog',\n", - " 'uri'='http://localhost:19120/api/v1',\n", - " 'ref'='{ref}',\n", - " 'warehouse' = '{warehouse}')\"\"\"\n", - " )\n", - "\n", - "\n", - "create_ref_catalog(nessie_client.get_default_branch())\n", - "print(\"\\n\\n\\nFlink running\\n\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Solving Data Engineering problems with Nessie\n", - "============================\n", - "\n", - "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. 
They need to be able to retrieve data quickly and be able to create charts with it.\n", - "\n", - "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set up Nessie branches (via Nessie CLI)\n", - "----------------------------\n", - "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", - "\n", - "- Create a new branch named `dev`\n", - "- List all branches\n", - "\n", - "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "create_ref_catalog(\"dev\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n", - "\n", - "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[0m\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create tables under dev branch\n", - "-------------------------------------\n", - "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", - "\n", - "We create two tables under the `dev` branch:\n", - "- `salaries`\n", - "- `totals_stats`\n", - "\n", - "These tables list the salaries per player per year and their stats per year.\n", - "\n", - "To create the data we:\n", - "\n", - "1. switch our branch context to dev\n", - "2. create the table\n", - "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. 
A production use case would likely take feeds from official data sources" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: An illegal reflective access operation has occurred\n", - "WARNING: Illegal reflective access by org.apache.hadoop.security.authentication.util.KerberosUtil (file:/srv/conda/envs/flink-demo/lib/python3.7/site-packages/pyflink/lib/hadoop-auth-2.10.1.jar) to method sun.security.krb5.Config.getInstance()\n", - "WARNING: Please consider reporting this to the maintainers of org.apache.hadoop.security.authentication.util.KerberosUtil\n", - "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", - "WARNING: All illegal access operations will be denied in a future release\n", - "log4j:WARN No appenders could be found for logger (org.apache.htrace.core.Tracer).\n", - "log4j:WARN Please initialize the log4j system properly.\n", - "log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,465 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,465 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,465 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,664 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,664 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,665 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,665 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "\n", - "\n", - "\n", - "Added 51 rows to the salaries table and 93 rows to the totals_stats table.\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "# Load the dataset\n", - "from pyflink.table import DataTypes\n", - "from pyflink.table.descriptors import Schema, OldCsv, FileSystem\n", - "\n", - "# Creating `salaries` table\n", - "(\n", - " table_env.connect(FileSystem().path(\"../datasets/nba/salaries.csv\"))\n", - " .with_format(\n", - " OldCsv()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " 
.field(\"Salary\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " )\n", - " .with_schema(\n", - " Schema()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"Salary\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " )\n", - " .create_temporary_table(\"dev_catalog.nba.salaries_temp\")\n", - ")\n", - "\n", - "table_env.execute_sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.salaries\n", - " (Season STRING, Team STRING, Salary STRING, Player STRING)\"\"\"\n", - ").wait()\n", - "\n", - "tab = table_env.from_path(\"dev_catalog.nba.salaries_temp\")\n", - "tab.execute_insert(\"dev_catalog.nba.salaries\").wait()\n", - "\n", - "# Creating `totals_stats` table\n", - "(\n", - " table_env.connect(FileSystem().path(\"../datasets/nba/totals_stats.csv\"))\n", - " .with_format(\n", - " OldCsv()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Age\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"ORB\", DataTypes.STRING())\n", - " .field(\"DRB\", DataTypes.STRING())\n", - " .field(\"TRB\", DataTypes.STRING())\n", - " .field(\"AST\", DataTypes.STRING())\n", - " .field(\"STL\", DataTypes.STRING())\n", - " .field(\"BLK\", DataTypes.STRING())\n", - " .field(\"TOV\", DataTypes.STRING())\n", - " .field(\"PTS\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " .field(\"RSorPO\", DataTypes.STRING())\n", - " )\n", - " .with_schema(\n", - " Schema()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Age\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"ORB\", DataTypes.STRING())\n", - " .field(\"DRB\", DataTypes.STRING())\n", - " .field(\"TRB\", DataTypes.STRING())\n", - " .field(\"AST\", DataTypes.STRING())\n", - " .field(\"STL\", DataTypes.STRING())\n", - " .field(\"BLK\", DataTypes.STRING())\n", - " .field(\"TOV\", DataTypes.STRING())\n", - " .field(\"PTS\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " .field(\"RSorPO\", DataTypes.STRING())\n", - " )\n", - " .create_temporary_table(\"dev_catalog.nba.totals_stats_temp\")\n", - ")\n", - "\n", - "table_env.execute_sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.totals_stats (Season STRING, Age STRING, Team STRING,\n", - " ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING, PTS STRING,\n", - " Player STRING, RSorPO STRING)\"\"\"\n", - ").wait()\n", - "\n", - "tab = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\")\n", - "tab.execute_insert(\"dev_catalog.nba.totals_stats\").wait()\n", - "\n", - "salaries = table_env.from_path(\"main_catalog.nba.`salaries@dev`\").select(lit(1).count).to_pandas().values[0][0]\n", - "totals_stats = table_env.from_path(\"main_catalog.nba.`totals_stats@dev`\").select(lit(1).count).to_pandas().values[0][0]\n", - "print(f\"\\n\\n\\nAdded {salaries} rows to the salaries table and {totals_stats} rows to the totals_stats table.\\n\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Now we count the rows in our tables to ensure they are the same number as the csv files. Note we use the `table@branch` notation which overrides the context set by the catalog." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:04,807 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,869 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,872 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,874 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,876 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,879 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,881 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,883 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "51\n", - "2022-05-24 07:45:06,280 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,344 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,347 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,351 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,354 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,357 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,360 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,364 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "93\n" - ] - } - ], - "source": [ - "table_count = table_env.from_path(\"dev_catalog.nba.`salaries@dev`\").select(\"Season.count\").to_pandas().values[0][0]\n", - "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "table_count = table_env.from_path(\"dev_catalog.nba.`totals_stats@dev`\").select(\"Season.count\").to_pandas().values[0][0]\n", - "csv_count = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check generated tables\n", - "----------------------------\n", - "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", - "let's verify that the `main` branch was not altered by our changes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "And on the `dev` branch we expect to see two tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.totals_stats\n", - "\tnba.salaries\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list --ref dev" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can also verify that the `dev` and `main` branches point to different commits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dev f48b93594ddead3a7616a271d657f0fff97cd0c4c04d4a579fa165aa96a69908\n", - "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[0m\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dev promotion into main\n", - "-----------------------\n", - "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", - "We merge `dev` into `main` via the command line `merge` command.\n", - "Both branches should be at the same revision after merging/promotion." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie merge dev -b main --force" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can verify that the `main` branch now contains the expected tables and row counts.\n", - "\n", - "The tables are now on `main` and ready for consumption by our blog authors and analysts!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33m* main facfd43be1d062734ca0cda5ae900dde398180bf3f370a19627da8a2419589b0\n", - "\u001b[0m dev f48b93594ddead3a7616a271d657f0fff97cd0c4c04d4a579fa165aa96a69908\n", - "\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:10,661 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,724 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,725 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,727 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,729 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,730 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,732 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,733 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,239 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,304 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,307 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,312 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,316 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,319 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,322 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,326 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n" - ] - } - ], - "source": [ - "table_count = table_env.from_path(\"main_catalog.nba.salaries\").select(\"Season.count\").to_pandas().values[0][0]\n", - "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - "assert table_count == csv_count\n", - "\n", - "table_count = table_env.from_path(\"main_catalog.nba.totals_stats\").select(\"Season.count\").to_pandas().values[0][0]\n", - "csv_count = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - "assert table_count == csv_count" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Perform regular ETL on the new tables\n", - "-------------------\n", - "Our 
analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", - "\n", - "1. Update the salaries table to add new data\n", - "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n", - "3. We create a new table to hold information about the players appearances in all star games\n", - "\n", - "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "create_ref_catalog(\"etl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:13,368 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n" - ] - } - ], - "source": [ - "# add some salaries for Kevin Durant\n", - "table_env.execute_sql(\n", - " \"\"\"INSERT INTO etl_catalog.nba.salaries\n", - " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n", - " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n", - " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n", - " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n", - ").wait()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Rename the table `totals_stats` to `new_totals_stats`\n", - "table_env.execute_sql(\"ALTER TABLE etl_catalog.nba.totals_stats RENAME TO etl_catalog.nba.new_totals_stats\").wait()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:15,480 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,543 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,546 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,549 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,551 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,554 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 
07:45:15,557 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,560 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SeasonAgeTeamORBTRBASTSTLBLKTOVPFPTSPlayer
02004-0526LAL367314516Kobe Bryant
12005-0627LAL07830358Kobe Bryant
22006-0728LAL156604131Kobe Bryant
32007-0829LAL01000000Kobe Bryant
42008-0930LAL144401027Kobe Bryant
52009-1031LALKobe Bryant
62009-1025CLE156402125Lebron James
72010-1126MIA21210004329Lebron James
82011-1227MIA067004236Lebron James
92012-1328MIA035104019Lebron James
102013-1429MIA177301022Lebron James
112014-1530CLE157204130Lebron James
122010-1132LAL10143304237Kobe Bryant
132011-1233LAL011201227Kobe Bryant
142012-1334LAL24822129Kobe Bryant
152013-1435LALKobe Bryant
162014-1536LALKobe Bryant
172015-1637LAL167101110Kobe Bryant
181997-9819LAL261201118Kobe Bryant
191999-0021LAL113201315Kobe Bryant
202000-0122LAL247103319Kobe Bryant
212001-0223LAL255100231Kobe Bryant
222002-0324LAL276325522Kobe Bryant
232003-0425LAL144516320Kobe Bryant
24SeasonAgeTeamORBTRBASTSTLBLKTOVPFPTSPlayer
252004-0520CLE186203013Lebron James
262005-0621CLE262201229Lebron James
272006-0722CLE066104028Lebron James
282007-0823CLE189224327Lebron James
292008-0924CLE053003020Lebron James
301992-9329CHI345406530Michael Jordan
311995-9632CHI141100120Michael Jordan
321996-9733CHI31111203414Michael Jordan
331997-9834CHI168302023Michael Jordan
342001-0238WAS04320118Michael Jordan
352002-0339WAS252202320Michael Jordan
362015-1631CLE047004013Lebron James
372016-1732CLE031004223Lebron James
382017-1833CLE0108105229Lebron James
392018-1934LAL284021119Lebron James
401984-8521CHI36231147Michael Jordan
411985-8622CHIMichael Jordan
421986-8723CHI004205211Michael Jordan
431987-8824CHI383442540Michael Jordan
441988-8925CHI123504128Michael Jordan
451989-9026CHI152515117Michael Jordan
461990-9127CHI3552010226Michael Jordan
471991-9228CHI115201218Michael Jordan
\n", - "
" - ], - "text/plain": [ - " Season Age Team ORB TRB AST STL BLK TOV PF PTS Player\n", - "0 2004-05 26 LAL 3 6 7 3 1 4 5 16 Kobe Bryant\n", - "1 2005-06 27 LAL 0 7 8 3 0 3 5 8 Kobe Bryant\n", - "2 2006-07 28 LAL 1 5 6 6 0 4 1 31 Kobe Bryant\n", - "3 2007-08 29 LAL 0 1 0 0 0 0 0 0 Kobe Bryant\n", - "4 2008-09 30 LAL 1 4 4 4 0 1 0 27 Kobe Bryant\n", - "5 2009-10 31 LAL Kobe Bryant\n", - "6 2009-10 25 CLE 1 5 6 4 0 2 1 25 Lebron James\n", - "7 2010-11 26 MIA 2 12 10 0 0 4 3 29 Lebron James\n", - "8 2011-12 27 MIA 0 6 7 0 0 4 2 36 Lebron James\n", - "9 2012-13 28 MIA 0 3 5 1 0 4 0 19 Lebron James\n", - "10 2013-14 29 MIA 1 7 7 3 0 1 0 22 Lebron James\n", - "11 2014-15 30 CLE 1 5 7 2 0 4 1 30 Lebron James\n", - "12 2010-11 32 LAL 10 14 3 3 0 4 2 37 Kobe Bryant\n", - "13 2011-12 33 LAL 0 1 1 2 0 1 2 27 Kobe Bryant\n", - "14 2012-13 34 LAL 2 4 8 2 2 1 2 9 Kobe Bryant\n", - "15 2013-14 35 LAL Kobe Bryant\n", - "16 2014-15 36 LAL Kobe Bryant\n", - "17 2015-16 37 LAL 1 6 7 1 0 1 1 10 Kobe Bryant\n", - "18 1997-98 19 LAL 2 6 1 2 0 1 1 18 Kobe Bryant\n", - "19 1999-00 21 LAL 1 1 3 2 0 1 3 15 Kobe Bryant\n", - "20 2000-01 22 LAL 2 4 7 1 0 3 3 19 Kobe Bryant\n", - "21 2001-02 23 LAL 2 5 5 1 0 0 2 31 Kobe Bryant\n", - "22 2002-03 24 LAL 2 7 6 3 2 5 5 22 Kobe Bryant\n", - "23 2003-04 25 LAL 1 4 4 5 1 6 3 20 Kobe Bryant\n", - "24 Season Age Team ORB TRB AST STL BLK TOV PF PTS Player\n", - "25 2004-05 20 CLE 1 8 6 2 0 3 0 13 Lebron James\n", - "26 2005-06 21 CLE 2 6 2 2 0 1 2 29 Lebron James\n", - "27 2006-07 22 CLE 0 6 6 1 0 4 0 28 Lebron James\n", - "28 2007-08 23 CLE 1 8 9 2 2 4 3 27 Lebron James\n", - "29 2008-09 24 CLE 0 5 3 0 0 3 0 20 Lebron James\n", - "30 1992-93 29 CHI 3 4 5 4 0 6 5 30 Michael Jordan\n", - "31 1995-96 32 CHI 1 4 1 1 0 0 1 20 Michael Jordan\n", - "32 1996-97 33 CHI 3 11 11 2 0 3 4 14 Michael Jordan\n", - "33 1997-98 34 CHI 1 6 8 3 0 2 0 23 Michael Jordan\n", - "34 2001-02 38 WAS 0 4 3 2 0 1 1 8 Michael Jordan\n", - "35 2002-03 39 WAS 2 5 2 2 0 2 3 20 Michael Jordan\n", - "36 2015-16 31 CLE 0 4 7 0 0 4 0 13 Lebron James\n", - "37 2016-17 32 CLE 0 3 1 0 0 4 2 23 Lebron James\n", - "38 2017-18 33 CLE 0 10 8 1 0 5 2 29 Lebron James\n", - "39 2018-19 34 LAL 2 8 4 0 2 1 1 19 Lebron James\n", - "40 1984-85 21 CHI 3 6 2 3 1 1 4 7 Michael Jordan\n", - "41 1985-86 22 CHI Michael Jordan\n", - "42 1986-87 23 CHI 0 0 4 2 0 5 2 11 Michael Jordan\n", - "43 1987-88 24 CHI 3 8 3 4 4 2 5 40 Michael Jordan\n", - "44 1988-89 25 CHI 1 2 3 5 0 4 1 28 Michael Jordan\n", - "45 1989-90 26 CHI 1 5 2 5 1 5 1 17 Michael Jordan\n", - "46 1990-91 27 CHI 3 5 5 2 0 10 2 26 Michael Jordan\n", - "47 1991-92 28 CHI 1 1 5 2 0 1 2 18 Michael Jordan" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Creating `allstar_games_stats` table\n", - "(\n", - " table_env.connect(FileSystem().path(\"../datasets/nba/allstar_games_stats.csv\"))\n", - " .with_format(\n", - " OldCsv()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Age\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"ORB\", DataTypes.STRING())\n", - " .field(\"TRB\", DataTypes.STRING())\n", - " .field(\"AST\", DataTypes.STRING())\n", - " .field(\"STL\", DataTypes.STRING())\n", - " .field(\"BLK\", DataTypes.STRING())\n", - " .field(\"TOV\", DataTypes.STRING())\n", - " .field(\"PF\", DataTypes.STRING())\n", - " .field(\"PTS\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " )\n", - " .with_schema(\n", - " Schema()\n", - " 
.field(\"Season\", DataTypes.STRING())\n", - " .field(\"Age\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"ORB\", DataTypes.STRING())\n", - " .field(\"TRB\", DataTypes.STRING())\n", - " .field(\"AST\", DataTypes.STRING())\n", - " .field(\"STL\", DataTypes.STRING())\n", - " .field(\"BLK\", DataTypes.STRING())\n", - " .field(\"TOV\", DataTypes.STRING())\n", - " .field(\"PF\", DataTypes.STRING())\n", - " .field(\"PTS\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " )\n", - " .create_temporary_table(\"etl_catalog.nba.allstar_games_stats_temp\")\n", - ")\n", - "\n", - "table_env.execute_sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS etl_catalog.nba.allstar_games_stats (Season STRING, Age STRING,\n", - " Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING,\n", - " PF STRING, PTS STRING, Player STRING)\"\"\"\n", - ").wait()\n", - "\n", - "tab = table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\")\n", - "tab.execute_insert(\"etl_catalog.nba.allstar_games_stats\").wait()\n", - "\n", - "# Notice how we view the data on the etl branch via @etl\n", - "table_env.from_path(\"etl_catalog.nba.`allstar_games_stats@etl`\").to_pandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can verify that the new table isn't on the `main` branch but is present on the etl branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# Since we have been working on the `etl` branch, the `allstar_games_stats` table is not on the `main` branch\n", - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.allstar_games_stats\n", - "\tnba.new_totals_stats\n", - "\tnba.salaries\n", - "\n" - ] - } - ], - "source": [ - "# We should see `allstar_games_stats` and the `new_totals_stats` on the `etl` branch\n", - "!nessie content list --ref etl" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we are happy with the data we can again merge it into `main`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie merge etl -b main --force" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Now lets verify that the changes exist on the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.new_totals_stats\n", - "\tnba.allstar_games_stats\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33m* main 
720543fa3a9579d0bfee11e07f383d86468eb4d73dc207e5bd6ef7f76b000930\n", - "\u001b[0m etl c962d80b04ee619a6a0670cb5f664d948c86f6ebf66435027c5abe761e920c9e\n", - " dev f48b93594ddead3a7616a271d657f0fff97cd0c4c04d4a579fa165aa96a69908\n", - "\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:19,196 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,257 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,260 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,263 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,265 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,268 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,270 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,273 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n" - ] - } - ], - "source": [ - "table_count = (\n", - " table_env.from_path(\"main_catalog.nba.allstar_games_stats\").select(\"Season.count\").to_pandas().values[0][0]\n", - ")\n", - "csv_count = (\n", - " table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - ")\n", - "assert table_count == csv_count" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create `experiment` branch\n", - "--------------------------------\n", - "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", - "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", - "and carry out our experiment, which could consist of the following steps:\n", - "- drop `totals_stats` table\n", - "- add data to `salaries` table\n", - "- compare `experiment` and `main` tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "create_ref_catalog(\"experiment\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Drop the `new_totals_stats` table on the `experiment` branch\n", - "table_env.execute_sql(\"DROP TABLE experiment_catalog.nba.new_totals_stats\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:20,258 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n" - ] - } - ], - "source": [ - "# add some salaries for Dirk Nowitzki\n", - "table_env.execute_sql(\n", - " \"\"\"INSERT INTO experiment_catalog.nba.salaries VALUES\n", - " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n", - " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n", - " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n", - " 
('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", - ").wait()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.allstar_games_stats\n", - "\n" - ] - } - ], - "source": [ - "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `new_totals_stats`)\n", - "!nessie content list --ref experiment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.new_totals_stats\n", - "\tnba.allstar_games_stats\n", - "\n" - ] - } - ], - "source": [ - "# `main` hasn't changed been changed and still has the `new_totals_stats` table\n", - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n", - "Notice the use of the `nessie` catalog and the use of `@experiment` to view data on the `experiment` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
EXPR$0
059
\n", - "
" - ], - "text/plain": [ - " EXPR$0\n", - "0 59" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table_env.from_path(\"main_catalog.nba.`salaries@experiment`\").select(lit(1).count).to_pandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n", - "to the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
EXPR$0
055
\n", - "
" - ], - "text/plain": [ - " EXPR$0\n", - "0 55" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table_env.from_path(\"main_catalog.nba.`salaries@main`\").select(lit(1).count).to_pandas()" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nessie Iceberg/Flink SQL Demo with NBA Dataset\n", + "============================\n", + "This demo showcases how to use Nessie Python API along with Flink from Iceberg\n", + "\n", + "Initialize PyFlink\n", + "----------------------------------------------\n", + "To get started, we will first have to do a few setup steps that give us everything we need\n", + "to get started with Nessie. In case you're interested in the detailed setup steps for Flink, you can check out the [docs](https://projectnessie.org/tools/iceberg/flink/)\n", + "\n", + "The Binder server has downloaded flink and some data for us as well as started a Nessie server in the background. All we have to do is start Flink\n", + "\n", + "The below cell starts a local Flink session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import os\n", + "from pyflink.datastream import StreamExecutionEnvironment\n", + "from pyflink.table import StreamTableEnvironment\n", + "from pyflink.table.expressions import lit\n", + "from pynessie import init\n", + "\n", + "# where we will store our data\n", + "warehouse = os.path.join(os.getcwd(), \"flink-warehouse\")\n", + "# this was downloaded when Binder started, its available on maven central\n", + "iceberg_flink_runtime_jar = os.path.join(os.getcwd(), \"../iceberg-flink-runtime-1.17-1.4.2.jar\")\n", + "assert os.path.exists(iceberg_flink_runtime_jar)\n", + "\n", + "env = StreamExecutionEnvironment.get_execution_environment()\n", + "env.add_jars(\"file://{}\".format(iceberg_flink_runtime_jar))\n", + "table_env = StreamTableEnvironment.create(env)\n", + "\n", + "nessie_client = init()\n", + "\n", + "\n", + "def create_ref_catalog(ref):\n", + " \"\"\"\n", + " Create a flink catalog that is tied to a specific ref.\n", + "\n", + " In order to create the catalog we have to first create the branch\n", + " \"\"\"\n", + " default_branch = nessie_client.get_default_branch()\n", + " if ref != default_branch:\n", + " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n", + " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n", + " # The important args below are:\n", + " # type - tell Flink to use Iceberg as the catalog\n", + " # catalog-impl - which Iceberg catalog to use, in this case we want Nessie\n", + " # uri - the location of the nessie server.\n", + " # ref - the Nessie ref/branch we want to use (defaults to main)\n", + " # warehouse - the location this catalog should store its data\n", + " table_env.execute_sql(\n", + " f\"\"\"CREATE CATALOG {ref}_catalog WITH (\n", + " 'type'='iceberg',\n", + " 'catalog-impl'='org.apache.iceberg.nessie.NessieCatalog',\n", + " 'uri'='http://localhost:19120/api/v1',\n", + " 'ref'='{ref}',\n", + " 'warehouse' = '{warehouse}')\"\"\"\n", + " )\n", + "\n", + "\n", + "create_ref_catalog(nessie_client.get_default_branch())\n", + "print(\"\\n\\n\\nFlink running\\n\\n\\n\")\n", + "\n", + "# Create the 'nba' namespace.\n", + "table_env.execute_sql(\"CREATE DATABASE 
main_catalog.nba\").wait()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Solving Data Engineering problems with Nessie\n", + "============================\n", + "\n", + "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n", + "\n", + "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up Nessie branches (via Nessie CLI)\n", + "----------------------------\n", + "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", + "\n", + "- Create a new branch named `dev`\n", + "- List all branches\n", + "\n", + "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "create_ref_catalog(\"dev\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n", + "\n", + "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create tables under dev branch\n", + "-------------------------------------\n", + "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", + "\n", + "We create two tables under the `dev` branch:\n", + "- `salaries`\n", + "- `totals_stats`\n", + "\n", + "These tables list the salaries per player per year and their stats per year.\n", + "\n", + "To create the data we:\n", + "\n", + "1. switch our branch context to dev\n", + "2. create the table\n", + "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. 
A production use case would likely take feeds from official data sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the dataset\n", + "from pyflink.table import DataTypes, Schema, TableDescriptor\n", + "from pyflink.table.expressions import col\n", + "\n", + "# Creating `salaries` table\n", + "(\n", + " table_env.create_temporary_table(\n", + " \"dev_catalog.nba.salaries_temp\",\n", + " TableDescriptor.for_connector(\"filesystem\")\n", + " .schema(\n", + " Schema.new_builder()\n", + " .column(\"Season\", DataTypes.STRING())\n", + " .column(\"Team\", DataTypes.STRING())\n", + " .column(\"Salary\", DataTypes.STRING())\n", + " .column(\"Player\", DataTypes.STRING())\n", + " .build()\n", + " )\n", + " .option(\"path\", \"../datasets/nba/salaries.csv\")\n", + " .format(\"csv\")\n", + " .build(),\n", + " )\n", + ")\n", + "\n", + "table_env.execute_sql(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.`salaries@dev`\n", + " (Season STRING, Team STRING, Salary STRING, Player STRING)\"\"\"\n", + ").wait()\n", + "\n", + "tab = table_env.from_path(\"dev_catalog.nba.salaries_temp\")\n", + "tab.execute_insert(\"dev_catalog.nba.`salaries@dev`\").wait()\n", + "\n", + "# Creating `totals_stats` table\n", + "(\n", + " table_env.create_temporary_table(\n", + " \"dev_catalog.nba.totals_stats_temp\",\n", + " TableDescriptor.for_connector(\"filesystem\")\n", + " .schema(\n", + " Schema.new_builder()\n", + " .column(\"Season\", DataTypes.STRING())\n", + " .column(\"Age\", DataTypes.STRING())\n", + " .column(\"Team\", DataTypes.STRING())\n", + " .column(\"ORB\", DataTypes.STRING())\n", + " .column(\"DRB\", DataTypes.STRING())\n", + " .column(\"TRB\", DataTypes.STRING())\n", + " .column(\"AST\", DataTypes.STRING())\n", + " .column(\"STL\", DataTypes.STRING())\n", + " .column(\"BLK\", DataTypes.STRING())\n", + " .column(\"TOV\", DataTypes.STRING())\n", + " .column(\"PTS\", DataTypes.STRING())\n", + " .column(\"Player\", DataTypes.STRING())\n", + " .column(\"RSorPO\", DataTypes.STRING())\n", + " .build()\n", + " )\n", + " .option(\"path\", \"../datasets/nba/totals_stats.csv\")\n", + " .format(\"csv\")\n", + " .build(),\n", + " )\n", + ")\n", + "\n", + "table_env.execute_sql(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.`totals_stats@dev` (Season STRING, Age STRING, Team STRING,\n", + " ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING, PTS STRING,\n", + " Player STRING, RSorPO STRING)\"\"\"\n", + ").wait()\n", + "\n", + "tab = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\")\n", + "tab.execute_insert(\"dev_catalog.nba.`totals_stats@dev`\").wait()\n", + "\n", + "salaries = table_env.from_path(\"dev_catalog.nba.`salaries@dev`\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + "totals_stats = (\n", + " table_env.from_path(\"dev_catalog.nba.`totals_stats@dev`\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + ")\n", + "print(f\"\\n\\n\\nAdded {salaries} rows to the salaries table and {totals_stats} rows to the totals_stats table.\\n\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Now we count the rows in our tables to ensure they are the same number as the csv files. Note we use the `table@branch` notation which overrides the context set by the catalog." 
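(Aside from the diff: the `table@branch` suffix mentioned above pins a read to a specific Nessie reference, regardless of the `ref` the catalog was created with. A minimal sketch, assuming the `dev_catalog` defined earlier in this notebook:)

```python
# Illustrative sketch only (not part of this PR). Assumes the dev_catalog created
# above with 'ref'='dev'. Both reads resolve against the dev branch; the second
# form pins the branch explicitly and also works from a catalog on another ref.
via_catalog_ref = table_env.from_path("dev_catalog.nba.salaries")
via_pinned_ref = table_env.from_path("dev_catalog.nba.`salaries@dev`")
print(via_catalog_ref.to_pandas().shape, via_pinned_ref.to_pandas().shape)
```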
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "table_count = table_env.from_path(\"dev_catalog.nba.`salaries@dev`\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + "assert table_count == csv_count\n", + "print(table_count)\n", + "\n", + "table_count = (\n", + " table_env.from_path(\"dev_catalog.nba.`totals_stats@dev`\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + ")\n", + "csv_count = (\n", + " table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + ")\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check generated tables\n", + "----------------------------\n", + "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", + "let's verify that the `main` branch was not altered by our changes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "And on the `dev` branch we expect to see two tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie content list --ref dev" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can also verify that the `dev` and `main` branches point to different commits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dev promotion into main\n", + "-----------------------\n", + "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", + "We merge `dev` into `main` via the command line `merge` command.\n", + "Both branches should be at the same revision after merging/promotion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie merge dev -b main --force" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can verify that the `main` branch now contains the expected tables and row counts.\n", + "\n", + "The tables are now on `main` and ready for consumption by our blog authors and analysts!" 
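(Aside from the diff: a quick way to double-check the promotion outside Flink is to ask the pynessie client, already initialised in the setup cell, where each branch points. Sketch only:)

```python
# Sketch only (not part of this PR): print the commit hash each branch points at,
# using the same get_reference()/hash_ API the notebook's setup cell relies on.
for name in ("main", "dev"):
    ref = nessie_client.get_reference(name)
    print(f"{name}: {ref.hash_}")
```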
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" } - ], - "metadata": { - "kernelspec": { - "display_name": "flink-demo", - "language": "python", - "name": "flink-demo" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" + }, + "outputs": [], + "source": [ + "table_count = table_env.from_path(\"main_catalog.nba.salaries\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + "assert table_count == csv_count\n", + "\n", + "table_count = table_env.from_path(\"main_catalog.nba.totals_stats\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + "csv_count = (\n", + " table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + ")\n", + "assert table_count == csv_count" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Perform regular ETL on the new tables\n", + "-------------------\n", + "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", + "\n", + "1. Update the salaries table to add new data\n", + "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n", + "3. We create a new table to hold information about the players appearances in all star games\n", + "\n", + "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "create_ref_catalog(\"etl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# add some salaries for Kevin Durant\n", + "table_env.execute_sql(\n", + " \"\"\"INSERT INTO etl_catalog.nba.salaries\n", + " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n", + " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n", + " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n", + " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n", + ").wait()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Rename the table `totals_stats` to `new_totals_stats`\n", + "table_env.execute_sql(\n", + " \"ALTER TABLE etl_catalog.nba.`totals_stats@etl` RENAME TO etl_catalog.nba.new_totals_stats\"\n", + ").wait()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating `allstar_games_stats` table\n", + "(\n", + " table_env.create_temporary_table(\n", + " \"etl_catalog.nba.allstar_games_stats_temp\",\n", + " TableDescriptor.for_connector(\"filesystem\")\n", + " .schema(\n", + " Schema.new_builder()\n", + " .column(\"Season\", DataTypes.STRING())\n", + " .column(\"Age\", DataTypes.STRING())\n", + " .column(\"Team\", DataTypes.STRING())\n", + " .column(\"ORB\", DataTypes.STRING())\n", + " .column(\"TRB\", DataTypes.STRING())\n", + " .column(\"AST\", DataTypes.STRING())\n", + " .column(\"STL\", DataTypes.STRING())\n", + " .column(\"BLK\", DataTypes.STRING())\n", + " .column(\"TOV\", DataTypes.STRING())\n", + " .column(\"PF\", DataTypes.STRING())\n", + " .column(\"PTS\", DataTypes.STRING())\n", + " .column(\"Player\", DataTypes.STRING())\n", + " .build()\n", + " )\n", + " .option(\"path\", \"../datasets/nba/allstar_games_stats.csv\")\n", + " .format(\"csv\")\n", + " .build(),\n", + " )\n", + ")\n", + "\n", + "table_env.execute_sql(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS etl_catalog.nba.`allstar_games_stats@etl` (Season STRING, Age STRING,\n", + " Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING,\n", + " PF STRING, PTS STRING, Player STRING)\"\"\"\n", + ").wait()\n", + "\n", + "tab = table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\")\n", + "tab.execute_insert(\"etl_catalog.nba.`allstar_games_stats@etl`\").wait()\n", + "\n", + "# Notice how we view the data on the etl branch via @etl\n", + "table_env.from_path(\"etl_catalog.nba.`allstar_games_stats@etl`\").to_pandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can verify that the new table isn't on the `main` branch but is present on the etl branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Since we have been working on the `etl` branch, the `allstar_games_stats` table is not on the `main` branch\n", + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should see `allstar_games_stats` and the `new_totals_stats` on the `etl` branch\n", + "!nessie content list --ref etl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we are happy with the data we can again merge it into 
`main`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie merge etl -b main --force" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Now lets verify that the changes exist on the `main` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "table_count = (\n", + " table_env.from_path(\"main_catalog.nba.allstar_games_stats\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + ")\n", + "csv_count = (\n", + " table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n", + ")\n", + "assert table_count == csv_count" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create `experiment` branch\n", + "--------------------------------\n", + "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", + "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", + "and carry out our experiment, which could consist of the following steps:\n", + "- drop `totals_stats` table\n", + "- add data to `salaries` table\n", + "- compare `experiment` and `main` tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "create_ref_catalog(\"experiment\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the `new_totals_stats` table on the `experiment` branch\n", + "table_env.execute_sql(\"DROP TABLE experiment_catalog.nba.`new_totals_stats@etl`\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add some salaries for Dirk Nowitzki\n", + "table_env.execute_sql(\n", + " \"\"\"INSERT INTO experiment_catalog.nba.salaries VALUES\n", + " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n", + " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n", + " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n", + " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", + ").wait()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `new_totals_stats`)\n", + "!nessie content list --ref experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# `main` hasn't changed been changed and still has the `new_totals_stats` table\n", + "!nessie content list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n", + "Notice the use of the `nessie` catalog and the use of 
`@experiment` to view data on the `experiment` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table_env.from_path(\"main_catalog.nba.`salaries@experiment`\").select(lit(1).count).to_pandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n", + "to the `main` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" } + }, + "outputs": [], + "source": [ + "table_env.from_path(\"main_catalog.nba.`salaries@main`\").select(lit(1).count).to_pandas()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "flink-demo", + "language": "python", + "name": "flink-demo" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/nessie-iceberg-hive-demo-nba.ipynb b/notebooks/nessie-iceberg-hive-demo-nba.ipynb index 8dd639f8..e0680cb2 100644 --- a/notebooks/nessie-iceberg-hive-demo-nba.ipynb +++ b/notebooks/nessie-iceberg-hive-demo-nba.ipynb @@ -1,1084 +1,821 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Nessie Iceberg/Hive SQL Demo with NBA Dataset\n", - "============================\n", - "This demo showcases how to use Nessie Python API along with Hive from Iceberg\n", - "\n", - "Initialize PyHive\n", - "----------------------------------------------\n", - "To get started, we will first have to do a few setup steps that give us everything we need\n", - "to get started with Nessie. In case you're interested in the detailed setup steps for Hive, you can check out the [docs](https://projectnessie.org/tools/iceberg/hive/)\n", - "\n", - "The Binder server has downloaded Hive, Hadoop and some data for us as well as started a Nessie server in the background. All we have to do is to connect to Hive session.\n", - "\n", - "The below cell starts a local Hive session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "Hive running\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "import os\n", - "from pyhive import hive\n", - "from pynessie import init\n", - "\n", - "# where we will store our data\n", - "warehouse = \"file://\" + os.path.join(os.getcwd(), \"nessie_warehouse\")\n", - "\n", - "# where our datasets are located\n", - "datasets_path = \"file://\" + os.path.join(os.path.dirname(os.getcwd()), \"datasets\")\n", - "\n", - "nessie_client = init()\n", - "\n", - "\n", - "def create_ref_catalog(ref):\n", - " \"\"\"\n", - " Create a branch and switch the current ref to the created branch\n", - " \"\"\"\n", - " default_branch = nessie_client.get_default_branch()\n", - " if ref != default_branch:\n", - " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n", - " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n", - " return switch_ref_catalog(ref)\n", - "\n", - "\n", - "def switch_ref_catalog(ref):\n", - " \"\"\"\n", - " Switch a branch. When we switch the branch via Hive, we will need to reconnect to Hive\n", - " \"\"\"\n", - " # The important args below are:\n", - " # catalog-impl: which Iceberg catalog to use, in this case we want NessieCatalog\n", - " # uri: the location of the nessie server.\n", - " # ref: the Nessie ref/branch we want to use (defaults to main)\n", - " # warehouse: the location this catalog should store its data\n", - " return hive.connect(\n", - " \"localhost\",\n", - " configuration={\n", - " \"iceberg.catalog.dev_catalog.catalog-impl\": \"org.apache.iceberg.nessie.NessieCatalog\",\n", - " \"iceberg.catalog.dev_catalog.uri\": \"http://localhost:19120/api/v1\",\n", - " \"iceberg.catalog.dev_catalog.ref\": ref,\n", - " \"iceberg.catalog.dev_catalog.warehouse\": warehouse,\n", - " },\n", - " ).cursor()\n", - "\n", - "\n", - "print(\"\\n\\nHive running\\n\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Solving Data Engineering problems with Nessie\n", - "============================\n", - "\n", - "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n", - "\n", - "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set up Nessie branches (via Nessie CLI)\n", - "----------------------------\n", - "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", - "\n", - "- Create a new branch named `dev`\n", - "- List all branches\n", - "\n", - "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "current_ref = create_ref_catalog(\"dev\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n", - "\n", - "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[0m\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create tables under dev branch\n", - "-------------------------------------\n", - "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", - "\n", - "We create two tables under the `dev` branch:\n", - "- `salaries`\n", - "- `totals_stats`\n", - "\n", - "These tables list the salaries per player per year and their stats per year.\n", - "\n", - "To create the data we:\n", - "\n", - "1. switch our branch context to dev\n", - "2. create the table\n", - "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Created schema nba\n", - "\n", - "\n", - "Creating tables nba.salaries and nba.totals_stats....\n", - "\n", - "\n", - "Created and inserted data into table nba.salaries from dataset salaries\n", - "\n", - "\n", - "Created and inserted data into table nba.totals_stats from dataset totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# Creating our demo schema\n", - "current_ref.execute(\"CREATE SCHEMA IF NOT EXISTS nba\")\n", - "\n", - "print(\"\\nCreated schema nba\\n\")\n", - "\n", - "\n", - "print(\"\\nCreating tables nba.salaries and nba.totals_stats....\\n\")\n", - "\n", - "# Creating `salaries` table\n", - "\n", - "current_ref.execute(\n", - " f\"\"\"CREATE TABLE IF NOT EXISTS nba.salaries (Season STRING,\n", - " Team STRING, Salary STRING, Player STRING)\n", - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", - " LOCATION '{warehouse}/nba/salaries'\n", - " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", - " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", - ")\n", - "\n", - "## We create a temporary table to load data into our target table since\n", - "## is not possible to load data directly from CSV into non-native table.\n", - "current_ref.execute(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS nba.salaries_temp (Season STRING,\n", - " Team STRING, Salary STRING, Player STRING)\n", - " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", - ")\n", - "\n", - "current_ref.execute(f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/salaries.csv\" OVERWRITE INTO TABLE nba.salaries_temp')\n", - "current_ref.execute(\"INSERT OVERWRITE TABLE 
nba.salaries SELECT * FROM nba.salaries_temp\")\n", - "\n", - "print(\"\\nCreated and inserted data into table nba.salaries from dataset salaries\\n\")\n", - "\n", - "\n", - "# Creating `totals_stats` table\n", - "\n", - "current_ref.execute(\n", - " f\"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING,\n", - " DRB STRING, TRB STRING, AST STRING, STL STRING,\n", - " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", - " LOCATION '{warehouse}/nba/totals_stats'\n", - " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", - " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", - ")\n", - "\n", - "## We create a temporary table to load data into our target table since\n", - "## is not possible to load data directly from CSV into non-native table.\n", - "current_ref.execute(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats_temp (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING,\n", - " DRB STRING, TRB STRING, AST STRING, STL STRING,\n", - " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", - " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", - ")\n", - "\n", - "current_ref.execute(\n", - " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/totals_stats.csv\" OVERWRITE INTO TABLE nba.totals_stats_temp'\n", - ")\n", - "current_ref.execute(\"INSERT OVERWRITE TABLE nba.totals_stats SELECT * FROM nba.totals_stats_temp\")\n", - "\n", - "print(\"\\nCreated and inserted data into table nba.totals_stats from dataset totals_stats\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Now we count the rows in our tables to ensure they are the same number as the csv files. Unlike Spark and Flink demos, we can't use the notation of `table@branch` (see the github issue [here](https://github.com/projectnessie/nessie/issues/1985). Therefore, we just set Nessie ref settings through Hive setting `SET iceberg.catalog.{catalog}.ref = {branch}` whenever we want to work on a specific branch." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.salaries\n", - "\n", - "51\n", - "\n", - "Counting rows in nba.totals_stats\n", - "\n", - "93\n" - ] - } - ], - "source": [ - "# We make sure we are still in dev branch\n", - "current_ref = switch_ref_catalog(\"dev\")\n", - "\n", - "print(\"\\nCounting rows in nba.salaries\\n\")\n", - "\n", - "# We count now\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "print(\"\\nCounting rows in nba.totals_stats\\n\")\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check generated tables\n", - "----------------------------\n", - "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", - "let's verify that the `main` branch was not altered by our changes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "And on the `dev` branch we expect to see two tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.totals_stats\n", - "\tnba.salaries\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list --ref dev" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can also verify that the `dev` and `main` branches point to different commits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dev d1ea40ccb14fd8365828bf740d73e8ed9d04ce5d9739020d00d7ffa5937cf9d3\n", - "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[0m\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dev promotion into main\n", - "-----------------------\n", - "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", - "We merge `dev` into `main` via the command line `merge` command.\n", - "Both branches should be at the same revision after merging/promotion." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie merge dev -b main --force" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can verify that the `main` branch now contains the expected tables and row counts.\n", - "\n", - "The tables are now on `main` and ready for consumption by our blog authors and analysts!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33m* main 330f993ac08aceb2252702611f6bf1a92f49ac2e3fc709b250a017ba4a9cded6\n", - "\u001b[0m dev d1ea40ccb14fd8365828bf740d73e8ed9d04ce5d9739020d00d7ffa5937cf9d3\n", - "\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.salaries\n", - "\n", - "51\n", - "\n", - "Counting rows in nba.totals_stats\n", - "\n", - "93\n" - ] - } - ], - "source": [ - "# We switch to main branch\n", - "current_ref = switch_ref_catalog(\"main\")\n", - "\n", - "print(\"\\nCounting rows in nba.salaries\\n\")\n", - "\n", - "# We count now\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "print(\"\\nCounting rows in nba.totals_stats\\n\")\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Perform regular ETL on the new tables\n", - "-------------------\n", - "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", - "\n", - "1. Update the salaries table to add new data\n", - "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n", - "3. We create a new table to hold information about the players appearances in all star games\n", - "\n", - "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "current_ref = create_ref_catalog(\"etl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# add some salaries for Kevin Durant\n", - "current_ref.execute(\n", - " \"\"\"INSERT INTO nba.salaries\n", - " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n", - " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n", - " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n", - " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Creating table nba.allstar_games_stats\n", - "\n", - "\n", - "Created and inserted data into table nba.allstar_table_temp from dataset allstar_games_stats\n", - "\n", - "\n", - "Counting rows in nba.allstar_games_stats\n", - "\n", - "48\n" - ] - } - ], - "source": [ - "print(\"\\nCreating table nba.allstar_games_stats\\n\")\n", - "\n", - "# Creating `allstar_games_stats` table\n", - "current_ref.execute(\n", - " f\"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_games_stats (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING,\n", - " TRB STRING, AST STRING, STL STRING, BLK STRING,\n", - " TOV STRING, PF STRING, PTS STRING, Player STRING)\n", - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", - " LOCATION '{warehouse}/nba/allstar_games_stats'\n", - " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", - " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", - ")\n", - "\n", - "## We create a temporary table to load data into our target table since\n", - "## is not possible to load data directly from CSV into non-native table.\n", - "current_ref.execute(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_table_temp (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING,\n", - " AST STRING, STL STRING, BLK STRING,\n", - " TOV STRING, PF STRING, PTS STRING, Player STRING)\n", - " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", - ")\n", - "\n", - "current_ref.execute(\n", - " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/allstar_games_stats.csv\" OVERWRITE INTO TABLE nba.allstar_table_temp'\n", - ")\n", - "current_ref.execute(\"INSERT OVERWRITE TABLE nba.allstar_games_stats SELECT * FROM nba.allstar_table_temp\")\n", - "\n", - "print(\"\\nCreated and inserted data into table nba.allstar_table_temp from dataset allstar_games_stats\\n\")\n", - "\n", - "\n", - "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n", - "\n", - "# Since we can't do 'table@branch'\n", - "current_ref = switch_ref_catalog(\"etl\")\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n", - "print(current_ref.fetchone()[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can verify that the new table isn't on the `main` branch but is present on the etl branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# Since we have been working on the `etl` branch, the `allstar_games_stats` 
table is not on the `main` branch\n", - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.allstar_games_stats\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# We should see the new `allstar_games_stats` table on the `etl` branch\n", - "!nessie content list --ref etl" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we are happy with the data we can again merge it into `main`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie merge etl -b main --force" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Now lets verify that the changes exist on the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.allstar_games_stats\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33m* main 11ed5b46713231a5fb85f31083d47dbf6bfa1df5839bebbac08301cda8afe22f\n", - "\u001b[0m etl a3e06ba7595dfdb8bc67b0d6825587d2858cfe2b013bf1b95c5a1471578c4af3\n", - " dev d1ea40ccb14fd8365828bf740d73e8ed9d04ce5d9739020d00d7ffa5937cf9d3\n", - "\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.allstar_games_stats\n", - "\n", - "48\n" - ] - } - ], - "source": [ - "# We switch to the main branch\n", - "current_ref = switch_ref_catalog(\"main\")\n", - "\n", - "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n", - "\n", - "# We count now\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_table_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create `experiment` branch\n", - "--------------------------------\n", - "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", - "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", - "and carry out our experiment, which could consist of the following steps:\n", - "- drop `totals_stats` table\n", - "- add data to `salaries` table\n", - "- compare `experiment` and `main` tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "current_ref = 
create_ref_catalog(\"experiment\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Drop the `totals_stats` table on the `experiment` branch\n", - "current_ref.execute(\"DROP TABLE nba.totals_stats\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add some salaries for Dirk Nowitzki\n", - "current_ref.execute(\n", - " \"\"\"INSERT INTO nba.salaries VALUES\n", - " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n", - " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n", - " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n", - " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.allstar_games_stats\n", - "\n" - ] - } - ], - "source": [ - "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `totals_stats`)\n", - "!nessie content list --ref experiment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.allstar_games_stats\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# `main` hasn't been changed and still has the `totals_stats` table\n", - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the contents of the `salaries` table on the `experiment` branch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.salaries\n", - "\n", - "59\n" - ] - } - ], - "source": [ - "current_ref = switch_ref_catalog(\"experiment\")\n", - "\n", - "print(\"\\nCounting rows in nba.salaries\\n\")\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", - "print(current_ref.fetchone()[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and compare to the contents of the `salaries` table on the `main` branch." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.salaries\n", - "\n", - "56\n" - ] - } - ], - "source": [ - "current_ref = switch_ref_catalog(\"main\")\n", - "\n", - "# the following INSERT is a workaround for https://github.com/apache/iceberg/pull/4509 until iceberg 0.13.2 is released\n", - "# add a single salary for Dirk Nowitzki (so we expect 3 less total rows)\n", - "current_ref.execute(\n", - " \"\"\"INSERT INTO nba.salaries VALUES\n", - " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", - ")\n", - "\n", - "print(\"\\nCounting rows in nba.salaries\\n\")\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", - "print(current_ref.fetchone()[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And finally lets clean up after ourselves" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "!nessie branch --delete dev\n", - "!nessie branch --delete etl\n", - "!nessie branch --delete experiment" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "Nessie Iceberg/Hive SQL Demo with NBA Dataset\n", + "============================\n", + "This demo showcases how to use Nessie Python API along with Hive from Iceberg\n", + "\n", + "Initialize PyHive\n", + "----------------------------------------------\n", + "To get started, we will first have to do a few setup steps that give us everything we need\n", + "to get started with Nessie. In case you're interested in the detailed setup steps for Hive, you can check out the [docs](https://projectnessie.org/tools/iceberg/hive/)\n", + "\n", + "The Binder server has downloaded Hive, Hadoop and some data for us as well as started a Nessie server in the background. All we have to do is to connect to Hive session.\n", + "\n", + "The below cell starts a local Hive session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." 
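Before connecting, it can be useful to confirm that the background Nessie server is actually up. A minimal sketch, assuming the server exposes the v1 reference listing under the same base URI that the catalog configuration in the next cell points at:

import requests

# Sanity check (assumption: the demo's Nessie server listens on localhost:19120
# and serves the v1 REST API used by the catalog configuration below).
response = requests.get("http://localhost:19120/api/v1/trees")
response.raise_for_status()
print("Nessie is reachable:", response.status_code)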
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import requests\n", + "from pyhive import hive\n", + "from pynessie import init\n", + "\n", + "# where we will store our data\n", + "warehouse = \"file://\" + os.path.join(os.getcwd(), \"nessie_warehouse\")\n", + "\n", + "# where our datasets are located\n", + "datasets_path = \"file://\" + os.path.join(os.path.dirname(os.getcwd()), \"datasets\")\n", + "\n", + "nessie_client = init()\n", + "\n", + "\n", + "def create_namespace(ref: str, namespace: list[str]):\n", + " hash = nessie_client.get_reference(ref).hash_\n", + " # pynessie client has currently no code to create namespace, issue a plain REST request.\n", + " response = requests.post(\n", + " url=f\"http://127.0.0.1:19120/api/v2/trees/{ref}@{hash}/history/commit\",\n", + " headers={\"Accept\": \"application/json\", \"Content-Type\": \"application/json\"},\n", + " json={\n", + " \"commitMeta\": {\"message\": \"Create namespace nba\"},\n", + " \"operations\": [{\"type\": \"PUT\", \"key\": {\"elements\": namespace}, \"content\": {\"type\": \"NAMESPACE\"}}],\n", + " },\n", + " )\n", + " if response.status_code != 200:\n", + " raise Exception(f\"Could not create namespace: HTTP {response.status_code} {response.reason}: {response.json()}\")\n", + "\n", + "\n", + "def create_ref_catalog(ref: str):\n", + " \"\"\"\n", + " Create a branch and switch the current ref to the created branch\n", + " \"\"\"\n", + " default_branch = nessie_client.get_default_branch()\n", + " if ref != default_branch:\n", + " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n", + " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n", + " return switch_ref_catalog(ref)\n", + "\n", + "\n", + "def switch_ref_catalog(ref: str):\n", + " \"\"\"\n", + " Switch a branch. When we switch the branch via Hive, we will need to reconnect to Hive\n", + " \"\"\"\n", + " # The important args below are:\n", + " # catalog-impl: which Iceberg catalog to use, in this case we want NessieCatalog\n", + " # uri: the location of the nessie server.\n", + " # ref: the Nessie ref/branch we want to use (defaults to main)\n", + " # warehouse: the location this catalog should store its data\n", + " return hive.connect(\n", + " \"localhost\",\n", + " configuration={\n", + " \"iceberg.catalog.dev_catalog.catalog-impl\": \"org.apache.iceberg.nessie.NessieCatalog\",\n", + " \"iceberg.catalog.dev_catalog.uri\": \"http://localhost:19120/api/v1\",\n", + " \"iceberg.catalog.dev_catalog.ref\": ref,\n", + " \"iceberg.catalog.dev_catalog.warehouse\": warehouse,\n", + " },\n", + " ).cursor()\n", + "\n", + "\n", + "create_namespace(\"main\", [\"nba\"])\n", + "\n", + "\n", + "print(\"\\n\\nHive running\\n\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Solving Data Engineering problems with Nessie\n", + "============================\n", + "\n", + "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n", + "\n", + "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. 
We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up Nessie branches (via Nessie CLI)\n", + "----------------------------\n", + "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", + "\n", + "- Create a new branch named `dev`\n", + "- List all branches\n", + "\n", + "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_ref = create_ref_catalog(\"dev\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" + }, + "source": [ + "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n", + "\n", + "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" } + }, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create tables under dev branch\n", + "-------------------------------------\n", + "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", + "\n", + "We create two tables under the `dev` branch:\n", + "- `salaries`\n", + "- `totals_stats`\n", + "\n", + "These tables list the salaries per player per year and their stats per year.\n", + "\n", + "To create the data we:\n", + "\n", + "1. switch our branch context to dev\n", + "2. create the table\n", + "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. 
A production use case would likely take feeds from official data sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating our demo schema\n", + "current_ref.execute(\"CREATE SCHEMA IF NOT EXISTS nba\")\n", + "\n", + "print(\"\\nCreated schema nba\\n\")\n", + "\n", + "\n", + "print(\"\\nCreating tables nba.salaries and nba.totals_stats....\\n\")\n", + "\n", + "# Creating `salaries` table\n", + "\n", + "current_ref.execute(\n", + " f\"\"\"CREATE TABLE IF NOT EXISTS nba.salaries (Season STRING,\n", + " Team STRING, Salary STRING, Player STRING)\n", + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", + " LOCATION '{warehouse}/nba/salaries'\n", + " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", + " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", + ")\n", + "\n", + "## We create a temporary table to load data into our target table since\n", + "## is not possible to load data directly from CSV into non-native table.\n", + "current_ref.execute(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS nba.salaries_temp (Season STRING,\n", + " Team STRING, Salary STRING, Player STRING)\n", + " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", + ")\n", + "\n", + "current_ref.execute(f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/salaries.csv\" OVERWRITE INTO TABLE nba.salaries_temp')\n", + "current_ref.execute(\"INSERT OVERWRITE TABLE nba.salaries SELECT * FROM nba.salaries_temp\")\n", + "\n", + "print(\"\\nCreated and inserted data into table nba.salaries from dataset salaries\\n\")\n", + "\n", + "\n", + "# Creating `totals_stats` table\n", + "\n", + "current_ref.execute(\n", + " f\"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats (\n", + " Season STRING, Age STRING, Team STRING, ORB STRING,\n", + " DRB STRING, TRB STRING, AST STRING, STL STRING,\n", + " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", + " LOCATION '{warehouse}/nba/totals_stats'\n", + " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", + " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", + ")\n", + "\n", + "## We create a temporary table to load data into our target table since\n", + "## is not possible to load data directly from CSV into non-native table.\n", + "current_ref.execute(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats_temp (\n", + " Season STRING, Age STRING, Team STRING, ORB STRING,\n", + " DRB STRING, TRB STRING, AST STRING, STL STRING,\n", + " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", + " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", + ")\n", + "\n", + "current_ref.execute(\n", + " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/totals_stats.csv\" OVERWRITE INTO TABLE nba.totals_stats_temp'\n", + ")\n", + "current_ref.execute(\"INSERT OVERWRITE TABLE nba.totals_stats SELECT * FROM nba.totals_stats_temp\")\n", + "\n", + "print(\"\\nCreated and inserted data into table nba.totals_stats from dataset totals_stats\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Now we count the rows in our tables to ensure they are the same number as the csv files. Unlike Spark and Flink demos, we can't use the notation of `table@branch` (see the github issue [here](https://github.com/projectnessie/nessie/issues/1985). 
Therefore, we just set Nessie ref settings through Hive setting `SET iceberg.catalog.{catalog}.ref = {branch}` whenever we want to work on a specific branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# We make sure we are still in dev branch\n", + "current_ref = switch_ref_catalog(\"dev\")\n", + "\n", + "print(\"\\nCounting rows in nba.salaries\\n\")\n", + "\n", + "# We count now\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)\n", + "\n", + "print(\"\\nCounting rows in nba.totals_stats\\n\")\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check generated tables\n", + "----------------------------\n", + "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", + "let's verify that the `main` branch was not altered by our changes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "And on the `dev` branch we expect to see two tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie content list --ref dev" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can also verify that the `dev` and `main` branches point to different commits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dev promotion into main\n", + "-----------------------\n", + "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", + "We merge `dev` into `main` via the command line `merge` command.\n", + "Both branches should be at the same revision after merging/promotion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie merge dev -b main --force" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can verify that the `main` branch now contains the expected tables and row counts.\n", + "\n", + "The tables are now on `main` and ready for consumption by our blog authors and analysts!" 
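The same verification can also be done from Python with the pynessie client initialised in the first cell; a small sketch using only calls already shown above:

# Fetch the commit hashes that `!nessie --verbose branch` prints, but
# programmatically, via the pynessie client created earlier.
main_hash = nessie_client.get_reference("main").hash_
dev_hash = nessie_client.get_reference("dev").hash_
print(f"main -> {main_hash}")
print(f"dev  -> {dev_hash}")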
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# We switch to main branch\n", + "current_ref = switch_ref_catalog(\"main\")\n", + "\n", + "print(\"\\nCounting rows in nba.salaries\\n\")\n", + "\n", + "# We count now\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)\n", + "\n", + "print(\"\\nCounting rows in nba.totals_stats\\n\")\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Perform regular ETL on the new tables\n", + "-------------------\n", + "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", + "\n", + "1. Update the salaries table to add new data\n", + "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n", + "3. We create a new table to hold information about the players appearances in all star games\n", + "\n", + "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." 
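In production this would be scripted rather than run cell by cell. A rough, hypothetical sketch of such a job, assuming the helper functions defined in the first cell, the `nessie` CLI on the PATH, and that the ETL branch does not already exist:

import subprocess


def nightly_etl(branch: str = "etl") -> None:
    """Hypothetical nightly job mirroring the cells below."""
    cursor = create_ref_catalog(branch)  # 1. create and switch to the ETL branch
    # 2. run the night's DDL/DML on that branch, e.g. the INSERT and
    #    CREATE TABLE statements shown in the next cells
    cursor.execute("SELECT COUNT(*) FROM nba.salaries")  # placeholder validation query
    # 3. promote the branch, exactly like the `!nessie merge etl -b main --force` cell below
    subprocess.run(["nessie", "merge", branch, "-b", "main", "--force"], check=True)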
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_ref = create_ref_catalog(\"etl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# add some salaries for Kevin Durant\n", + "current_ref.execute(\n", + " \"\"\"INSERT INTO nba.salaries\n", + " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n", + " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n", + " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n", + " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\nCreating table nba.allstar_games_stats\\n\")\n", + "\n", + "# Creating `allstar_games_stats` table\n", + "current_ref.execute(\n", + " f\"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_games_stats (\n", + " Season STRING, Age STRING, Team STRING, ORB STRING,\n", + " TRB STRING, AST STRING, STL STRING, BLK STRING,\n", + " TOV STRING, PF STRING, PTS STRING, Player STRING)\n", + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", + " LOCATION '{warehouse}/nba/allstar_games_stats'\n", + " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", + " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", + ")\n", + "\n", + "## We create a temporary table to load data into our target table since\n", + "## is not possible to load data directly from CSV into non-native table.\n", + "current_ref.execute(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_table_temp (\n", + " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING,\n", + " AST STRING, STL STRING, BLK STRING,\n", + " TOV STRING, PF STRING, PTS STRING, Player STRING)\n", + " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", + ")\n", + "\n", + "current_ref.execute(\n", + " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/allstar_games_stats.csv\" OVERWRITE INTO TABLE nba.allstar_table_temp'\n", + ")\n", + "current_ref.execute(\"INSERT OVERWRITE TABLE nba.allstar_games_stats SELECT * FROM nba.allstar_table_temp\")\n", + "\n", + "print(\"\\nCreated and inserted data into table nba.allstar_table_temp from dataset allstar_games_stats\\n\")\n", + "\n", + "\n", + "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n", + "\n", + "# Since we can't do 'table@branch'\n", + "current_ref = switch_ref_catalog(\"etl\")\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n", + "print(current_ref.fetchone()[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can verify that the new table isn't on the `main` branch but is present on the etl branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Since we have been working on the `etl` branch, the `allstar_games_stats` table is not on the `main` branch\n", + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should see the new `allstar_games_stats` table on the `etl` branch\n", + "!nessie content list --ref etl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we are happy with the data we can again merge it into `main`" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie merge etl -b main --force" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Now lets verify that the changes exist on the `main` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# We switch to the main branch\n", + "current_ref = switch_ref_catalog(\"main\")\n", + "\n", + "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n", + "\n", + "# We count now\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_table_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create `experiment` branch\n", + "--------------------------------\n", + "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", + "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", + "and carry out our experiment, which could consist of the following steps:\n", + "- drop `totals_stats` table\n", + "- add data to `salaries` table\n", + "- compare `experiment` and `main` tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_ref = create_ref_catalog(\"experiment\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the `totals_stats` table on the `experiment` branch\n", + "current_ref.execute(\"DROP TABLE nba.totals_stats\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add some salaries for Dirk Nowitzki\n", + "current_ref.execute(\n", + " \"\"\"INSERT INTO nba.salaries VALUES\n", + " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n", + " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n", + " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n", + " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `totals_stats`)\n", + "!nessie content list --ref experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# `main` hasn't been changed and still has the `totals_stats` table\n", + "!nessie content list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the contents of the `salaries` table on the `experiment` branch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_ref = switch_ref_catalog(\"experiment\")\n", + "\n", + "print(\"\\nCounting rows in nba.salaries\\n\")\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", + "print(current_ref.fetchone()[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and compare to the contents of the `salaries` table on the `main` branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "current_ref = switch_ref_catalog(\"main\")\n", + "\n", + "# the following INSERT is a workaround for https://github.com/apache/iceberg/pull/4509 until iceberg 0.13.2 is released\n", + "# add a single salary for Dirk Nowitzki (so we expect 3 less total rows)\n", + "current_ref.execute(\n", + " \"\"\"INSERT INTO nba.salaries VALUES\n", + " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", + ")\n", + "\n", + "print(\"\\nCounting rows in nba.salaries\\n\")\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", + "print(current_ref.fetchone()[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And finally lets clean up after ourselves" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie branch --delete dev\n", + "!nessie branch --delete etl\n", + "!nessie branch --delete experiment" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/requirements_dev.txt b/notebooks/requirements_dev.txt index 3d705656..fbbb4280 100644 --- a/notebooks/requirements_dev.txt +++ b/notebooks/requirements_dev.txt @@ -15,18 +15,18 @@ # -r requirements.txt assertpy==1.1 +build==0.10.0 bump2version==1.0.1 -build==0.8.0 -ipython==7.34.0 -jupyterlab==3.4.7 +ipython==8.18.0 +jupyterlab==3.6.6 nbstripout==0.6.1 -pip==22.2.2 -pytest==7.1.3 -pytest-mock==3.8.2 -pytest-mypy==0.9.1 +pip==23.3.1 +pytest==7.4.3 +pytest-mock==3.12.0 +pytest-mypy==0.10.3 pytest-runner==6.0.0 testbook[dev]==0.4.2 -tox==3.26.0 +tox==4.11.3 twine==4.0.1 watchdog==2.1.9 -wheel==0.37.1 +wheel==0.41.3 diff --git a/notebooks/requirements_lint.txt b/notebooks/requirements_lint.txt index 57328b6c..6936aebc 100644 --- a/notebooks/requirements_lint.txt +++ b/notebooks/requirements_lint.txt @@ -14,14 +14,13 @@ # limitations under the License. 
# -r requirements_dev.txt -bandit==1.7.4 -black[jupyter]==22.8.0 -flake8==5.0.4 -flake8-annotations==2.9.1 +bandit==1.7.5 +black[jupyter]==23.11.0 +flake8==6.1.0 +flake8-annotations==3.0.1 flake8-bandit==4.1.1 -flake8-black==0.3.3 -flake8-bugbear==22.9.23 -flake8-docstrings==1.6.0 -flake8-import-order==0.18.1 -pytest-mypy==0.9.1 -safety==2.2.0 +flake8-black==0.3.6 +flake8-bugbear==23.9.16 +flake8-docstrings==1.7.0 +flake8-import-order==0.18.2 +pytest-mypy==0.10.3 diff --git a/notebooks/tests/__init__.py b/notebooks/tests/__init__.py index 4c3decb3..3e363460 100644 --- a/notebooks/tests/__init__.py +++ b/notebooks/tests/__init__.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- # # Copyright (C) 2020 Dremio # @@ -16,14 +17,12 @@ # """Unit tests for demo notebooks.""" import os -import platform import shutil import subprocess # noqa: S404 from contextlib import contextmanager from typing import Iterator from typing import List -from utils import fetch_nessie from utils import fetch_nessie_jar @@ -66,12 +65,5 @@ def start_nessie() -> Iterator[subprocess.Popen]: def _fetch_and_get_nessie_start_command() -> List[str]: - operating_system = platform.system().lower() - - if operating_system == "darwin": - # In Mac case, we use the nessie jar - runner = fetch_nessie_jar() - return ["java", "-jar", runner] - else: - runner = fetch_nessie() - return ["./" + runner] + runner = fetch_nessie_jar() + return ["java", "-jar", runner] diff --git a/notebooks/tests/scripts/start_hive b/notebooks/tests/scripts/start_hive index 7e3f03a6..5d2c50f6 100755 --- a/notebooks/tests/scripts/start_hive +++ b/notebooks/tests/scripts/start_hive @@ -10,6 +10,25 @@ HIVE_VERSION=$(python -c "import utils;print(utils._HIVE_VERSION)") export HADOOP_HOME=$PWD/hadoop-$HADOOP_VERSION +# Check for Java 8 + 11 for tox (also in /docker/binder/start.hive) +if [[ -z ${JAVA8_HOME} || -z ${JAVA11_HOME} || ! -d ${JAVA8_HOME} || ! -d ${JAVA11_HOME} ]] ; then + cat < /dev/stderr + + +============================================================================================================ +Define the JAVA8_HOME and JAVA11_HOME environment variables to point to Java 8 and Java 11 development kits. +============================================================================================================ + +Need Java 8 for Hive server to work. +Java 11 (not newer!) is required for Spark, but also Nessie. + + +! + exit 1 +fi + #Start Hive +echo "Starting Hive for tox, current directory: $(pwd)" +rm -f nohup.out derby.log chmod +x $PWD/../docker/binder/start.hive -nohup $PWD/../docker/binder/start.hive $PWD $PWD/../docker/binder/resources $HIVE_VERSION \ No newline at end of file +nohup $PWD/../docker/binder/start.hive $PWD $PWD/../docker/binder/resources $HIVE_VERSION diff --git a/notebooks/tests/test_nessie_delta_demo_nba.py b/notebooks/tests/test_nessie_delta_demo_nba.py deleted file mode 100644 index e870776e..00000000 --- a/notebooks/tests/test_nessie_delta_demo_nba.py +++ /dev/null @@ -1,100 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (C) 2020 Dremio -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -# -"""Tests the Nessie + Iceberg + Spark Jupyter Notebook with the NBA dataset.""" -from typing import Iterator - -import pytest -from assertpy import assert_that -from assertpy.assertpy import AssertionBuilder -from testbook import testbook -from testbook.client import TestbookNotebookClient -from utils import fetch_spark - -from . import _find_notebook -from . import _remove_folders -from . import start_nessie - -num_salaries_on_experiment = """count(1) -0 58""" - -num_salaries_on_main = """count(1) -0 54""" - - -@pytest.fixture(scope="module") -def notebook() -> Iterator[TestbookNotebookClient]: - """Pytest fixture to run a notebook.""" - path_to_notebook = _find_notebook("nessie-delta-demo-nba.ipynb") - fetch_spark() - - with start_nessie(): - with testbook(path_to_notebook, timeout=300) as tb: - tb.execute() - yield tb - # Clean all the folders that being created by this test - _remove_folders(["spark-warehouse", "spark_warehouse"]) - - -def _assert_that_notebook( - text: str, notebook: TestbookNotebookClient, count: int = 0 -) -> AssertionBuilder: - for seen, value in enumerate( - n for n, i in enumerate(notebook.cells) if text in i["source"] - ): - if seen == count: - return assert_that(notebook.cell_output_text(value)) - raise Exception(f"Unable to find cell with text: {text}") - - -def test_notebook_output(notebook: TestbookNotebookClient) -> None: - """Runs through the entire notebook and checks the output. - - :param notebook: The notebook to test - :return: - """ - assertion = lambda x: _assert_that_notebook(x, notebook) # NOQA - assertion_counted = lambda x, y: _assert_that_notebook(x, notebook, y) # NOQA - - assertion("findspark.init").contains("Spark Running") - - assertion("CREATE BRANCH dev").contains("Branch").contains("dev") - - assertion("INSERT INTO totals_stats SELECT * FROM stats_table").is_equal_to( - "Empty DataFrame\nColumns: []\nIndex: []" - ) - - assertion_counted("LIST REFERENCES", 1).contains("main").contains("dev").contains( - "Branch" - ) - - assertion_counted( - 'spark.sql("select count(*) from salaries").toPandas()', 2 - ).is_equal_to(num_salaries_on_experiment) - - assertion_counted( - 'spark.sql("select count(*) from salaries").toPandas()', 3 - ).is_equal_to(num_salaries_on_main) - - -def test_dependency_setup(notebook: TestbookNotebookClient) -> None: - """Verifies that dependencies were correctly set up. 
- - :param notebook: The notebook to test - :return: - """ - spark = notebook.ref("spark") - assert_that(spark).is_not_none() diff --git a/notebooks/tox.ini b/notebooks/tox.ini index 014810dd..49dac650 100644 --- a/notebooks/tox.ini +++ b/notebooks/tox.ini @@ -15,24 +15,24 @@ # [tox] -envlist = py37, format, lint, flink, hive +envlist = py310, format, lint, flink, hive skipsdist = True [gh-actions] python = - 3.7: py37, lint, flink, hive + 3.10: py310, lint, flink, hive [testenv:format] allowlist_externals=bash deps = -r{toxinidir}/requirements_lint.txt commands = - black --target-version py37 tests format_notebooks.py + black --target-version py310 tests ./format_notebooks.py python -m format_notebooks # this formats python code inside the notebooks - bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py37 --line-length 120 --ipynb' + bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py310 --line-length 120 --ipynb' # this formats cell output from single string to list of strings and removes execution metadata - bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose nbstripout --keep-output --strip-empty-cells' + bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose nbstripout --keep-output --drop-empty-cells' python -m format_notebooks [testenv:lint] @@ -40,15 +40,14 @@ allowlist_externals=bash deps = -r{toxinidir}/requirements_lint.txt commands = - flake8 tests format_notebooks.py - bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py37 --line-length 120 --ipynb --check' + flake8 tests ./format_notebooks.py + bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py310 --line-length 120 --ipynb --check' [testenv:flink] setenv = PYTHONPATH = {toxinidir}:{toxinidir}/../docker -passenv = TOXENV CI CODECOV_* +passenv = TOXENV,CI,CODECOV_* deps = - --use-deprecated=legacy-resolver -r{toxinidir}/../docker/binder/requirements_flink.txt -r{toxinidir}/requirements_dev.txt commands = @@ -59,7 +58,7 @@ commands = allowlist_externals=bash setenv = PYTHONPATH = {toxinidir}:{toxinidir}/../docker -passenv = TOXENV CI CODECOV_* JAVA_HOME +passenv = TOXENV,CI,CODECOV_*,JAVA_HOME,JAVA8_HOME,JAVA11_HOME deps = -r{toxinidir}/../docker/binder/requirements.txt -r{toxinidir}/requirements_dev.txt @@ -71,10 +70,10 @@ commands = [testenv] setenv = PYTHONPATH = {toxinidir}:{toxinidir}/../docker -passenv = TOXENV CI CODECOV_* +passenv = TOXENV,CI,CODECOV_* deps = -r{toxinidir}/../docker/binder/requirements.txt -r{toxinidir}/requirements_dev.txt commands = - nbstripout {toxinidir}/nessie-iceberg-demo-nba.ipynb {toxinidir}/nessie-delta-demo-nba.ipynb + nbstripout {toxinidir}/nessie-iceberg-demo-nba.ipynb pytest --basetemp={envtmpdir} -ra tests --ignore tests/test_nessie_iceberg_flink_demo_nba.py --ignore tests/test_nessie_iceberg_hive_demo_nba.py
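
# A minimal local-run sketch, assuming a checkout of this repository and locally installed JDKs;
# the JDK paths below are placeholders, not part of the change set above.
export JAVA8_HOME=/usr/lib/jvm/temurin-8-jdk-amd64    # Java 8 is needed by the Hive server
export JAVA11_HOME=/usr/lib/jvm/temurin-11-jdk-amd64  # Java 11 is needed by Spark and Nessie
export JAVA_HOME=$JAVA11_HOME

# Run the Hive notebook tests via the tox env listed in notebooks/tox.ini (envlist includes "hive");
# tox passes JAVA8_HOME/JAVA11_HOME through to tests/scripts/start_hive as configured above.
cd notebooks
tox -e hive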