diff --git a/.github/workflows/demos-docker-build.yaml b/.github/workflows/demos-docker-build.yaml
index d269b0cc..ea5606e0 100644
--- a/.github/workflows/demos-docker-build.yaml
+++ b/.github/workflows/demos-docker-build.yaml
@@ -34,7 +34,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
- python-version: [3.7]
+ python-version: ['3.10']
steps:
- uses: actions/checkout@v3
diff --git a/.github/workflows/notebooks.yaml b/.github/workflows/notebooks.yaml
index 4330c174..6b7434fa 100644
--- a/.github/workflows/notebooks.yaml
+++ b/.github/workflows/notebooks.yaml
@@ -27,6 +27,10 @@ on:
- '**/*.md'
- '.github/renovate.json5'
+concurrency:
+ group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
+ cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
+
jobs:
python:
name: Testing Jupyter Notebooks
@@ -34,12 +38,25 @@ jobs:
strategy:
max-parallel: 4
matrix:
- python-version: [3.7]
+ python-version: ['3.10']
steps:
- uses: actions/checkout@v3
- name: Install system dependencies
run: sudo apt-get install libsasl2-dev libsasl2-modules
+ - name: Set up Java
+ uses: actions/setup-java@v3
+ with:
+ distribution: 'temurin'
+ # Need Java 8 for Hive + 11 for Spark (and Nessie)
+ java-version: |
+ 8
+ 11
+ - name: setup JAVAx_HOME
+ run: |
+ echo "JAVA8_HOME=$JAVA_HOME_8_X64" >> ${GITHUB_ENV}
+ echo "JAVA11_HOME=$JAVA_HOME_11_X64" >> ${GITHUB_ENV}
+ echo "JAVA_HOME=$JAVA_HOME_11_X64" >> ${GITHUB_ENV}
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@@ -50,8 +67,22 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install tox tox-gh-actions build
+ - name: Cache Tools (Hadoop, Hive, Spark)
+ id: cache-tools
+ uses: actions/cache@v3
+ with:
+ path: |
+ notebooks/hadoop-*
+ notebooks/apache-hive-*
+ notebooks/spark-*
+ notebooks/iceberg-*.jar
+ notebooks/nessie-quarkus-*.jar
+ key: tools-cache-${{ hashFiles('docker/utils/__init__.py') }}
+ - name: Check Dockerfile has correct registry
+ run: |
+ grep -q 'FROM ghcr.io/projectnessie/nessie-binder-demos:.*' binder/Dockerfile
- name: Check Dockerfile hash is up-to-date
- if: github.actor != 'dependabot[bot]'
+ if: github.actor != 'renovate[bot]'  # Renovate's GitHub App acts as 'renovate[bot]'
run: |
bash -ex .github/scripts/modify_dockerfile.sh
changed_hash=$(git status --porcelain binder/Dockerfile)
@@ -63,5 +94,10 @@ jobs:
fi
echo "PASSED: Dockerfile hash is up-to-date!"
- name: Test Notebooks with Tox
- working-directory: notebooks/tests
+ working-directory: notebooks/
run: tox
+ - name: Dump Hive output on error
+ working-directory: notebooks/
+ if: failure()
+ run: |
+ cat nohup.out
diff --git a/.gitignore b/.gitignore
index ef780a61..c8821355 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ notebooks/iceberg-*-runtime-*
notebooks/hadoop-*
notebooks/apache-hive-*-bin
notebooks/metastore_db
+notebooks/hiveserver2.pid
notebooks/*.log
notebooks/*.out
# using sed on mac always needs a backup file
@@ -38,6 +39,9 @@ venv/
__pycache__/
.pytest_cache
+# pyenv
+.python-version
+
# Jetbrains IDEs
/.idea
*.iws
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index db3f99ce..42a0ed1c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,19 +16,19 @@
repos:
- repo: https://github.com/psf/black
- rev: 21.6b0
+ rev: 22.8.0
hooks:
- id: black
language_version: python3
- repo: https://github.com/asottile/reorder_python_imports
- rev: v2.5.0
+ rev: v3.12.0
hooks:
- id: reorder-python-imports
args:
- --py3-plus
- --application-directories=pynessie:tests:python/pynessie:python:tests
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.0.1
+ rev: v4.5.0
hooks:
- id: trailing-whitespace
- id: check-added-large-files
diff --git a/README.md b/README.md
index 136a9e19..72d1abaa 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,6 @@
These demos run under binder and can be found at:
* [Spark and Iceberg](https://mybinder.org/v2/gh/projectnessie/nessie-demos/main?labpath=notebooks%2Fnessie-iceberg-demo-nba.ipynb)
-* [Spark and Delta](https://mybinder.org/v2/gh/projectnessie/nessie-demos/main?labpath=notebooks%2Fnessie-delta-demo-nba.ipynb)
* [Flink and Iceberg](https://mybinder.org/v2/gh/projectnessie/nessie-demos/main?labpath=notebooks%2Fnessie-iceberg-flink-demo-nba.ipynb)
* [Hive and Iceberg](https://mybinder.org/v2/gh/projectnessie/nessie-demos/main?labpath=notebooks%2Fnessie-iceberg-hive-demo-nba.ipynb)
@@ -22,13 +21,7 @@ Nessie version is set in Binder at `docker/binder/requirements_base.txt`. Curren
### Iceberg
-Currently we are using Iceberg `0.13.1` and it is specified in both iceberg notebooks as well as `docker/utils/__init__.py`
-
-### Delta
-
-Currently, the Delta version is taken directly from the Nessie version and isn't explicitly noted. It is currently `1.1.0-nessie`
-
-See https://github.com/projectnessie/nessie/blob/nessie-0.30.0/pom.xml#L171
+Currently we are using Iceberg `1.4.2` and it is specified in both iceberg notebooks as well as `docker/utils/__init__.py`
### Spark
@@ -37,7 +30,7 @@ Only has to be updated in `docker/binder/requirements.txt`. Currently, Iceberg s
### Flink
-Flink version is set in Binder at `docker/binder/requirements_flink.txt`. Currently, we are using `1.13.6`.
+Flink version is set in Binder at `docker/binder/requirements_flink.txt`. Currently, we are using `1.17.1`.
### Hadoop
@@ -60,7 +53,7 @@ Of course, Binder just lets a user "simply start" a notebook via a simple "click
## Development
For development, you will need to make sure to have the following installed:
-- Python 3.7+
+- Python 3.10+
- pre-commit
Regarding pre-commit, you will need to make sure is installed through `pre-commit install` in order to install the hooks locally since this repo
diff --git a/binder/Dockerfile b/binder/Dockerfile
index 53efebc4..b0587230 100644
--- a/binder/Dockerfile
+++ b/binder/Dockerfile
@@ -2,7 +2,7 @@
# Tag will be automatically generated through pre-commit hook if any changes
# happened in the docker/ folder
-FROM ghcr.io/projectnessie/nessie-binder-demos:649ec80b8fa7d9666178380a33b2e645a52d5985
+FROM ghcr.io/projectnessie/nessie-binder-demos:dd32c4413d91c22676121f62119bcc7f167e4752
# Create the necessary folders for the demo, this will be created and owned by {NB_USER}
RUN mkdir -p notebooks && mkdir -p datasets
diff --git a/binder/README.md b/binder/README.md
index ba7274a6..2d1b1887 100644
--- a/binder/README.md
+++ b/binder/README.md
@@ -1,8 +1,8 @@
## Building binder locally
### Prerequisites
-You need to have a python 3.7+ installed.
-We recommend to use [pyenv](https://github.com/pyenv/pyenv) for managing your python environment(s).
+You need to have Python 3.10+ installed.
+We recommend using [pyenv](https://github.com/pyenv/pyenv) for managing your Python environment(s).
To build the binder image locally, firstly, you need to install `jupyter-repo2docker` dependency:
@@ -29,8 +29,8 @@ Run (or look into) the `build_run_local_docker.sh` script how to do this semi-au
After those steps, the binder should be running on your local machine.
Next, find the output similar to this:
```shell
-[C 13:38:25.199 NotebookApp]
-
+[C 13:38:25.199 NotebookApp]
+
To access the notebook, open this file in a browser:
file:///home/jovyan/.local/share/jupyter/runtime/nbserver-40-open.html
Or copy and paste this URL:
diff --git a/docker/binder/apt.txt b/docker/binder/apt.txt
index 84c10603..0ed4240f 100644
--- a/docker/binder/apt.txt
+++ b/docker/binder/apt.txt
@@ -16,9 +16,12 @@
# Packages needed for mybinder.org
+openjdk-8-jdk-headless
openjdk-11-jdk-headless
# SASL lib needed for thrift API to access Hive
libsasl2-dev
libsasl2-modules
# for removal of duplicate files
rdfind
+# need `netstat` for start scripts
+net-tools
diff --git a/docker/binder/postBuild b/docker/binder/postBuild
index d23de2c6..d7ba9374 100644
--- a/docker/binder/postBuild
+++ b/docker/binder/postBuild
@@ -26,7 +26,7 @@ python -m ipykernel install --name "flink-demo" --user
python -c "import utils;utils._copy_all_hadoop_jars_to_pyflink()"
conda deactivate
-python -c "import utils;utils.fetch_nessie()"
+python -c "import utils;utils.fetch_nessie_jar()"
python -c "import utils;utils.fetch_spark()"
diff --git a/docker/binder/requirements.txt b/docker/binder/requirements.txt
index a95841b6..065e6ff7 100644
--- a/docker/binder/requirements.txt
+++ b/docker/binder/requirements.txt
@@ -1,5 +1,7 @@
-r requirements_base.txt
findspark==2.0.1
-pandas==1.3.5
-pyhive[hive]==0.6.5
-pyspark==3.2.1
+# Need this numpy version due to compatibility reasons with numpy/pyspark
+numpy==1.21.6
+pandas==1.5.3
+pyhive[hive_pure_sasl]==0.7.0
+pyspark==3.2.4
diff --git a/docker/binder/requirements_base.txt b/docker/binder/requirements_base.txt
index b842f398..303085b3 100644
--- a/docker/binder/requirements_base.txt
+++ b/docker/binder/requirements_base.txt
@@ -1 +1 @@
-pynessie==0.30.0
+pynessie==0.65.0
diff --git a/docker/binder/requirements_flink.txt b/docker/binder/requirements_flink.txt
index 3e1775e6..664b2d3f 100644
--- a/docker/binder/requirements_flink.txt
+++ b/docker/binder/requirements_flink.txt
@@ -1,4 +1,2 @@
-r requirements_base.txt
-apache-flink==1.13.6
-# flink requires pandas<1.2.0 see https://github.com/apache/flink/blob/release-1.13.6/flink-python/setup.py#L313
-pandas==1.1.5
+apache-flink==1.17.1
diff --git a/docker/binder/runtime.txt b/docker/binder/runtime.txt
new file mode 100644
index 00000000..55090899
--- /dev/null
+++ b/docker/binder/runtime.txt
@@ -0,0 +1 @@
+python-3.10
diff --git a/docker/binder/start b/docker/binder/start
index 18c41941..6531ae7e 100755
--- a/docker/binder/start
+++ b/docker/binder/start
@@ -15,19 +15,33 @@
# limitations under the License.
#
-nohup ./nessie-quarkus-runner &
-
SPARK_VERSION=$(python -c "import utils;print(utils._SPARK_VERSION)")
HADOOP_VERSION=$(python -c "import utils;print(utils._HADOOP_VERSION)")
HIVE_VERSION=$(python -c "import utils;print(utils._HIVE_VERSION)")
-export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
+export JAVA11_HOME=/usr/lib/jvm/java-11-openjdk-amd64
+export JAVA8_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+export JAVA_HOME=$JAVA11_HOME
+export PATH=$JAVA_HOME/bin:$PATH
+
+nohup java -jar nessie-quarkus-runner.jar > nohup-nessie.out &
+# Wait until Nessie is ready: poll port 19120 once per second, then fail if it is still not listening.
+echo "Waiting up to 180 seconds for Nessie to be ready..."
+for _ in $(seq 1 180) ; do
+ (netstat -ant | grep -q ':19120 .* LISTEN') && break
+ sleep 1
+done
+if ! (netstat -ant | grep -q ':19120 .* LISTEN') ; then
+ echo "Nessie did not start / not listening on port 19120!"
+ exit 1
+fi
+echo "Nessie listening on port 19120."
export SPARK_HOME=$PWD/spark-$SPARK_VERSION-bin-hadoop3.2
export HADOOP_HOME=$PWD/hadoop-$HADOOP_VERSION
#Start Hive
chmod +x $PWD/binder/start.hive
-nohup $PWD/binder/start.hive $PWD $PWD/binder/resources $HIVE_VERSION
+nohup $PWD/binder/start.hive $PWD $PWD/binder/resources $HIVE_VERSION > nohup-hive.out
exec "$@"
diff --git a/docker/binder/start.hive b/docker/binder/start.hive
index c8ddd968..22f0e7e2 100755
--- a/docker/binder/start.hive
+++ b/docker/binder/start.hive
@@ -20,6 +20,8 @@ RESOURCE_DIR=$2
HIVE_VERSION=$3
HIVE_FOLDER_NAME="apache-hive-$HIVE_VERSION-bin"
HIVE_WAREHOUSE_DIR=$HIVE_PARENT_DIR/hive_warehouse
+HIVE_PID_FILE=$HIVE_PARENT_DIR/hiveserver2.pid
+HIVE_DB=$HIVE_PARENT_DIR/metastore_db
if [ -z "$HIVE_PARENT_DIR" ]; then
echo "Input the parent dir as the first argument"
@@ -38,15 +40,45 @@ fi
export HIVE_HOME=$HIVE_PARENT_DIR/$HIVE_FOLDER_NAME
-# Create hive warehouse folder
-mkdir $HIVE_WAREHOUSE_DIR
-
# Copy the needed configs to Hive folder
cp $RESOURCE_DIR/hive/config/hive-site.xml ${HIVE_HOME}/conf/
# Set Hive warehouse path in the hive-site.xml
sed -i.bak "s~HIVE_WAREHOUSE_DIR~$HIVE_WAREHOUSE_DIR~g" ${HIVE_HOME}/conf/hive-site.xml
+# Check that JAVA8_HOME and JAVA11_HOME are set and are directories (also in /notebooks/tests/scripts/start_hive)
+if [[ -z ${JAVA8_HOME} || -z ${JAVA11_HOME} || ! -d ${JAVA8_HOME} || ! -d ${JAVA11_HOME} ]] ; then
+ cat <<! > /dev/stderr
+
+
+============================================================================================================
+Define the JAVA8_HOME and JAVA11_HOME environment variables to point to Java 8 and Java 11 development kits.
+============================================================================================================
+
+Need Java 8 for Hive server to work.
+Java 11 (not newer!) is required for Spark, but also Nessie.
+
+
+!
+ exit 1
+fi
+
+# Kill an already running hiveserver
+if [[ -f $HIVE_PID_FILE ]] ; then
+ kill "$(cat $HIVE_PID_FILE)" || true
+ rm $HIVE_PID_FILE
+fi
+
+# Remove an already existing metastore-db
+if [[ -d $HIVE_DB ]] ; then
+ echo "Removing existing $HIVE_DB"
+ rm -rf $HIVE_DB
+fi
+
+# (Re-)create hive warehouse folder
+rm -rf $HIVE_WAREHOUSE_DIR
+mkdir -p $HIVE_WAREHOUSE_DIR
+
# Initialize Hive's Derby database
$HIVE_HOME/bin/schematool -dbType derby -initSchema
echo "Finished initializing Derby database for Hive."
@@ -54,5 +86,38 @@ echo "Finished initializing Derby database for Hive."
# increase the Heap memory being used by Hive-MapReduce jobs
export HADOOP_HEAPSIZE=1500
+# Use Java 8 for Hive :facepalm:
+OLD_PATH="$PATH"
+export PATH="$JAVA8_HOME/bin:$PATH"
+export JAVA_HOME=$JAVA8_HOME
+cat < $HIVE_PID_FILE
+echo "... PID is $(cat $HIVE_PID_FILE)"
+
+# Wait until Hive accepts Thrift requests (it may start slowly in CI): poll port 10000 once per second, then fail if still not listening.
+echo "Waiting up to 180 seconds for Hive to be ready..."
+for _ in $(seq 1 180) ; do
+ (netstat -ant | grep -q ':10000 .* LISTEN') && break
+ sleep 1
+done
+if ! (netstat -ant | grep -q ':10000 .* LISTEN') ; then
+ echo "Hive did not start / not listening on port 10000 (Thrift)!"
+ exit 1
+fi
+echo "Hive listening on port 10000 (Thrift)."
+
+# Reset environment
+export JAVA_HOME=$JAVA11_HOME
+export PATH=$OLD_PATH
diff --git a/docker/utils/__init__.py b/docker/utils/__init__.py
index 4434fbd7..2907642f 100644
--- a/docker/utils/__init__.py
+++ b/docker/utils/__init__.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 Dremio
#
@@ -18,7 +19,6 @@
import os
import shutil
import site
-import stat
import sysconfig
import tarfile
from typing import Optional
@@ -36,14 +36,18 @@
_SPARK_FILENAME = None
_SPARK_URL = None
-_HADOOP_VERSION = "2.10.1"
+_NESSIE_VERSION = "0.74.0"
+
+_HADOOP_VERSION = "2.10.2"
_HADOOP_FILENAME = f"hadoop-{_HADOOP_VERSION}"
_HADOOP_URL = f"https://archive.apache.org/dist/hadoop/common/hadoop-{_HADOOP_VERSION}/{_HADOOP_FILENAME}.tar.gz"
-_FLINK_MAJOR_VERSION = "1.13"
+_FLINK_MAJOR_VERSION = "1.17"
-_ICEBERG_VERSION = "0.13.1"
-_ICEBERG_FLINK_FILENAME = f"iceberg-flink-runtime-{_FLINK_MAJOR_VERSION}-{_ICEBERG_VERSION}.jar"
+_ICEBERG_VERSION = "1.4.2"
+_ICEBERG_FLINK_FILENAME = (
+ f"iceberg-flink-runtime-{_FLINK_MAJOR_VERSION}-{_ICEBERG_VERSION}.jar"
+)
_ICEBERG_FLINK_URL = f"https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-{_FLINK_MAJOR_VERSION}/{_ICEBERG_VERSION}/{_ICEBERG_FLINK_FILENAME}"
_ICEBERG_HIVE_FILENAME = f"iceberg-hive-runtime-{_ICEBERG_VERSION}.jar"
_ICEBERG_HIVE_URL = f"https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-hive-runtime/{_ICEBERG_VERSION}/{_ICEBERG_HIVE_FILENAME}"
@@ -55,7 +59,9 @@
)
-def _link_file_into_dir(source_file: str, target_dir: str, replace_if_exists=True) -> None:
+def _link_file_into_dir(
+ source_file: str, target_dir: str, replace_if_exists=True
+) -> None:
assert os.path.isfile(source_file)
assert os.path.isdir(target_dir)
@@ -75,7 +81,7 @@ def _link_file_into_dir(source_file: str, target_dir: str, replace_if_exists=Tru
os.link(source_file, target_file)
assert os.path.isfile(target_file), (source_file, target_file)
- action = 'replaced' if replaced else 'created'
+ action = "replaced" if replaced else "created"
print(f"Link target was {action}: {target_file} (source: {source_file})")
@@ -112,7 +118,9 @@ def _copy_all_hadoop_jars_to_pyflink() -> None:
pyflink_lib_dir = _find_pyflink_lib_dir()
for _jar_count, jar in enumerate(_jar_files()):
_link_file_into_dir(jar, pyflink_lib_dir)
- print(f"Linked {_jar_count} HADOOP jar files into the pyflink lib dir at location {pyflink_lib_dir}")
+ print(
+ f"Linked {_jar_count} HADOOP jar files into the pyflink lib dir at location {pyflink_lib_dir}"
+ )
def _find_pyflink_lib_dir() -> Optional[str]:
@@ -139,16 +147,6 @@ def _download_file(filename: str, url: str) -> None:
f.write(r.content)
-def fetch_nessie() -> str:
- """Download nessie executable."""
- runner = "nessie-quarkus-runner"
-
- url = _get_base_nessie_url()
- _download_file(runner, url)
- os.chmod(runner, os.stat(runner).st_mode | stat.S_IXUSR)
- return runner
-
-
def fetch_nessie_jar() -> str:
"""Download nessie Jar in order to run the tests in Mac"""
runner = "nessie-quarkus-runner.jar"
@@ -159,12 +157,8 @@ def fetch_nessie_jar() -> str:
def _get_base_nessie_url() -> str:
- import pynessie
-
- version = pynessie.__version__
-
return "https://github.com/projectnessie/nessie/releases/download/nessie-{}/nessie-quarkus-{}-runner".format(
- version, version
+ _NESSIE_VERSION, _NESSIE_VERSION
)
diff --git a/notebooks/nessie-delta-demo-nba.ipynb b/notebooks/nessie-delta-demo-nba.ipynb
deleted file mode 100644
index 6d8f5e2e..00000000
--- a/notebooks/nessie-delta-demo-nba.ipynb
+++ /dev/null
@@ -1,1666 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Nessie Spark SQL Demo with NBA Dataset\n",
- "============================\n",
- "This demo showcases how to use Nessie Python API along with Spark3 from Delta Lake\n",
- "\n",
- "Initialize Pyspark\n",
- "----------------------------------------------\n",
- "To get started, we will first have to do a few setup steps that give us everything we need\n",
- "to get started with Nessie. In case you're interested in the detailed setup steps for Spark, you can check out the [docs](https://projectnessie.org/tools/deltalake/spark/)\n",
- "\n",
- "The Binder server has downloaded spark and some data for us as well as started a Nessie server in the background. All we have to do is start Spark\n",
- "\n",
- "The below cell starts a local Spark session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "WARNING: An illegal reflective access operation has occurred\n",
- "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
- "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
- "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
- "WARNING: All illegal access operations will be denied in a future release\n",
- "https://storage.googleapis.com/nessie-maven added as a remote repository with the name: repo-1\n",
- "Ivy Default Cache set to: /home/jovyan/.ivy2/cache\n",
- "The jars for the packages stored in: /home/jovyan/.ivy2/jars\n",
- "org.projectnessie#nessie-deltalake added as a dependency\n",
- "org.projectnessie#nessie-spark-3.2-extensions added as a dependency\n",
- ":: resolving dependencies :: org.apache.spark#spark-submit-parent-2ab7f1e0-01bb-42fd-bb2f-6c1b59cdc6dd;1.0\n",
- "\tconfs: [default]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- ":: loading settings :: url = jar:file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\tfound org.projectnessie#nessie-deltalake;0.30.0 in central\n",
- "\tfound org.antlr#antlr4-runtime;4.9.2 in central\n",
- "\tfound org.projectnessie#nessie-spark-3.2-extensions;0.30.0 in central\n",
- "downloading https://repo1.maven.org/maven2/org/projectnessie/nessie-deltalake/0.30.0/nessie-deltalake-0.30.0.jar ...\n",
- "\t[SUCCESSFUL ] org.projectnessie#nessie-deltalake;0.30.0!nessie-deltalake.jar (375ms)\n",
- "downloading https://repo1.maven.org/maven2/org/projectnessie/nessie-spark-3.2-extensions/0.30.0/nessie-spark-3.2-extensions-0.30.0.jar ...\n",
- "\t[SUCCESSFUL ] org.projectnessie#nessie-spark-3.2-extensions;0.30.0!nessie-spark-3.2-extensions.jar (87ms)\n",
- "downloading https://repo1.maven.org/maven2/org/antlr/antlr4-runtime/4.9.2/antlr4-runtime-4.9.2.jar ...\n",
- "\t[SUCCESSFUL ] org.antlr#antlr4-runtime;4.9.2!antlr4-runtime.jar (59ms)\n",
- ":: resolution report :: resolve 19292ms :: artifacts dl 524ms\n",
- "\t:: modules in use:\n",
- "\torg.antlr#antlr4-runtime;4.9.2 from central in [default]\n",
- "\torg.projectnessie#nessie-deltalake;0.30.0 from central in [default]\n",
- "\torg.projectnessie#nessie-spark-3.2-extensions;0.30.0 from central in [default]\n",
- "\t---------------------------------------------------------------------\n",
- "\t| | modules || artifacts |\n",
- "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n",
- "\t---------------------------------------------------------------------\n",
- "\t| default | 3 | 3 | 3 | 0 || 3 | 3 |\n",
- "\t---------------------------------------------------------------------\n",
- ":: retrieving :: org.apache.spark#spark-submit-parent-2ab7f1e0-01bb-42fd-bb2f-6c1b59cdc6dd\n",
- "\tconfs: [default]\n",
- "\t3 artifacts copied, 0 already retrieved (4023kB/6ms)\n",
- "22/05/24 07:49:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
- "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
- "Setting default log level to \"WARN\".\n",
- "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Spark Running\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "import findspark\n",
- "from pyspark.sql import *\n",
- "from pyspark import SparkConf\n",
- "import pynessie\n",
- "\n",
- "findspark.init()\n",
- "pynessie_version = pynessie.__version__\n",
- "\n",
- "warehouse = \"file://\" + os.getcwd() + \"/spark_warehouse/delta\"\n",
- "conf = SparkConf()\n",
- "# we add our custom fork of delta to the known repositories\n",
- "conf.set(\"spark.jars.repositories\", \"https://storage.googleapis.com/nessie-maven\")\n",
- "# we need delta libraries and the nessie sql extensions\n",
- "conf.set(\n",
- " \"spark.jars.packages\",\n",
- " f\"org.projectnessie:nessie-deltalake:{pynessie_version},org.projectnessie:nessie-spark-3.2-extensions:{pynessie_version}\",\n",
- ")\n",
- "# ensure python <-> java interactions are w/ pyarrow\n",
- "conf.set(\"spark.sql.execution.pyarrow.enabled\", \"true\")\n",
- "# create catalog dev_catalog as a Delta catalog\n",
- "conf.set(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\")\n",
- "# set the location for Nessie catalog to store data. Spark writes to this directory\n",
- "conf.set(\"spark.sql.catalog.spark_catalog.warehouse\", warehouse)\n",
- "# set the location of the nessie server. In this demo its running locally. There are many ways to run it (see https://projectnessie.org/try/)\n",
- "conf.set(\"spark.sql.catalog.spark_catalog.uri\", \"http://localhost:19120/api/v1\")\n",
- "# default branch for Nessie catalog to work on\n",
- "conf.set(\"spark.sql.catalog.spark_catalog.ref\", \"main\")\n",
- "# use no authorization. Options are NONE AWS BASIC and aws implies running Nessie on a lambda\n",
- "conf.set(\"spark.sql.catalog.spark_catalog.auth_type\", \"NONE\")\n",
- "# These two lines tell Delta to use Nessie as the internal storage handler thereby enabling Delta/Nessie integraton\n",
- "conf.set(\"spark.delta.logFileHandler.class\", \"org.projectnessie.deltalake.NessieLogFileMetaParser\")\n",
- "conf.set(\"spark.delta.logStore.class\", \"org.projectnessie.deltalake.NessieLogStore\")\n",
- "# enable the extensions for both Nessie and Delta\n",
- "conf.set(\n",
- " \"spark.sql.extensions\",\n",
- " \"io.delta.sql.DeltaSparkSessionExtension,org.projectnessie.spark.extensions.NessieSpark32SessionExtensions\",\n",
- ")\n",
- "# finally, start up the Spark server\n",
- "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
- "print(\"Spark Running\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Solving Data Engineering problems with Nessie\n",
- "============================\n",
- "\n",
- "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n",
- "\n",
- "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Set up Nessie branches\n",
- "----------------------------\n",
- "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n",
- "\n",
- "- Create a new branch named `dev`\n",
- "- List all branches\n",
- "\n",
- "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " dev | \n",
- " 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"CREATE BRANCH dev FROM main\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n",
- "\n",
- "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " dev | \n",
- " 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Branch | \n",
- " main | \n",
- " 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...\n",
- "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"LIST REFERENCES\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create tables under dev branch\n",
- "-------------------------------------\n",
- "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n",
- "\n",
- "We create two tables under the `dev` branch:\n",
- "- `salaries`\n",
- "- `totals_stats`\n",
- "\n",
- "These tables list the salaries per player per year and their stats per year.\n",
- "\n",
- "To create the data we:\n",
- "\n",
- "1. switch our branch context to dev\n",
- "2. create the table\n",
- "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- " \r"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE dev\")\n",
- "\n",
- "# Creating `salaries` table\n",
- "spark.sql(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS salaries (Season STRING, Team STRING, Salary STRING, Player STRING)\n",
- " USING delta LOCATION '{}/salaries'\"\"\".format(\n",
- " warehouse\n",
- " )\n",
- ")\n",
- "\n",
- "spark.sql(\n",
- " \"\"\"CREATE OR REPLACE TEMPORARY VIEW salaries_table USING csv\n",
- " OPTIONS (path \"../datasets/nba/salaries.csv\", header true)\"\"\"\n",
- ")\n",
- "spark.sql(\"INSERT INTO salaries SELECT * FROM salaries_table\")\n",
- "\n",
- "# Creating `totals_stats` table\n",
- "spark.sql(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS totals_stats (\n",
- " Season STRING, Age STRING, Team STRING, ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING,\n",
- " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n",
- " USING delta LOCATION '{}/totals_stats'\"\"\".format(\n",
- " warehouse\n",
- " )\n",
- ")\n",
- "spark.sql(\n",
- " \"\"\"CREATE OR REPLACE TEMPORARY VIEW stats_table USING csv\n",
- " OPTIONS (path \"../datasets/nba/totals_stats.csv\", header true)\"\"\"\n",
- ")\n",
- "spark.sql(\"INSERT INTO totals_stats SELECT * FROM stats_table\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now we count the rows in our tables to ensure they are the same number as the csv files."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "50\n",
- "92\n"
- ]
- }
- ],
- "source": [
- "table_count = spark.sql(\"select count(*) from salaries\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)\n",
- "\n",
- "table_count = spark.sql(\"select count(*) from totals_stats\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Check generated tables\n",
- "----------------------------\n",
- "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n",
- "let's verify that the `main` branch was not altered by our changes.\n",
- "\n",
- "Note: `SHOW TABLES` does not work on Delta because the Delta Catalog has no concept of references. We have to use the command line instead.\n",
- "In this demo we are switching the reference around regularly which means `SHOW TABLES` isn't always reliable. In the situation where\n",
- "your Spark job is only using one reference we can safely call `SHOW TABLES`."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And on the `dev` branch we expect to see two tables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "DELTA_LAKE_TABLE:\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list --ref dev"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "We can also verify that the `dev` and `main` branches point to different commits"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " dev | \n",
- " 54622696d1313cfcb012120083d917f65558f0906f73ab... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Branch | \n",
- " main | \n",
- " 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch dev 54622696d1313cfcb012120083d917f65558f0906f73ab...\n",
- "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"LIST REFERENCES\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Dev promotion into main\n",
- "-----------------------\n",
- "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n",
- "We merge `dev` into `main` via the Spark sql `merge` command.\n",
- "Both branches should be at the same revision after merging/promotion."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " main | \n",
- " dee6e3ec017cd39a272ccd4599bc2f1d4679731bd7b921... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name hash\n",
- "0 main dee6e3ec017cd39a272ccd4599bc2f1d4679731bd7b921..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"MERGE BRANCH dev\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can verify that the `main` branch now contains the expected tables and row counts.\n",
- "\n",
- "The tables are now on `main` and ready for consumption by our blog authors and analysts!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " main | \n",
- " dee6e3ec017cd39a272ccd4599bc2f1d4679731bd7b921... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Branch | \n",
- " dev | \n",
- " 54622696d1313cfcb012120083d917f65558f0906f73ab... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch main dee6e3ec017cd39a272ccd4599bc2f1d4679731bd7b921...\n",
- "1 Branch dev 54622696d1313cfcb012120083d917f65558f0906f73ab..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"LIST REFERENCES\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "DELTA_LAKE_TABLE:\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "50\n",
- "92\n"
- ]
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE main\")\n",
- "table_count = spark.sql(\"select count(*) from salaries\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)\n",
- "\n",
- "table_count = spark.sql(\"select count(*) from totals_stats\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "Perform regular ETL on the new tables\n",
- "-------------------\n",
- "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n",
- "\n",
- "1. Update the salaries table to add new data\n",
- "2. We add `Years` column to `totals_stats` table to show how many years the player was in the league\n",
- "3. We create a new table to hold information about the players appearances in all star games\n",
- "\n",
- "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " etl | \n",
- " dee6e3ec017cd39a272ccd4599bc2f1d4679731bd7b921... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch etl dee6e3ec017cd39a272ccd4599bc2f1d4679731bd7b921..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"CREATE BRANCH etl FROM main\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# add some salaries for Kevin Durant\n",
- "spark.sql(\"USE REFERENCE etl\")\n",
- "spark.sql(\n",
- " \"\"\"INSERT INTO salaries VALUES\n",
- " (\"2017-18\", \"Golden State Warriors\", \"$25000000\", \"Kevin Durant\"),\n",
- " (\"2018-19\", \"Golden State Warriors\", \"$30000000\", \"Kevin Durant\"),\n",
- " (\"2019-20\", \"Brooklyn Nets\", \"$37199000\", \"Kevin Durant\"),\n",
- " (\"2020-21\", \"Brooklyn Nets\", \"$39058950\", \"Kevin Durant\")\n",
- " \"\"\"\n",
- ").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Adding a column in the `totals_stats` table\n",
- "spark.sql(\"ALTER TABLE totals_stats ADD COLUMNS (Years STRING)\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " count(1) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 47 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " count(1)\n",
- "0 47"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Creating `allstar_games_stats` table and viewing the contents\n",
- "spark.sql(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS allstar_games_stats (\n",
- " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING,\n",
- " TOV STRING, PF STRING, PTS STRING, Player STRING)\n",
- " USING delta LOCATION '{}/allstar_stats'\"\"\".format(\n",
- " warehouse\n",
- " )\n",
- ")\n",
- "spark.sql(\n",
- " \"\"\"CREATE OR REPLACE TEMPORARY VIEW allstar_table USING csv\n",
- " OPTIONS (path \"../datasets/nba/allstar_games_stats.csv\", header true)\"\"\"\n",
- ")\n",
- "spark.sql(\"INSERT INTO allstar_games_stats SELECT * FROM allstar_table\").toPandas()\n",
- "\n",
- "# notice how we view the data on the etl branch via @etl\n",
- "spark.sql(\"select count(*) from allstar_games_stats\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can verify that the new table isn't on the `main` branch but is present on the etl branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "DELTA_LAKE_TABLE:\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "DELTA_LAKE_TABLE:\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.allstar_stats._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list --ref etl"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that we are happy with the data we can again merge it into `main`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " main | \n",
- " cdf244cd4af77968becc0ebd0439efea6c6e6df8923ed9... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name hash\n",
- "0 main cdf244cd4af77968becc0ebd0439efea6c6e6df8923ed9..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"MERGE BRANCH etl\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now lets verify that the changes exist on the `main` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "DELTA_LAKE_TABLE:\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.allstar_stats._delta_log\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " main | \n",
- " cdf244cd4af77968becc0ebd0439efea6c6e6df8923ed9... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Branch | \n",
- " etl | \n",
- " 2d11823828ee539d7609e1a88083ada6f37d39362a4e3a... | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Branch | \n",
- " dev | \n",
- " 54622696d1313cfcb012120083d917f65558f0906f73ab... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch main cdf244cd4af77968becc0ebd0439efea6c6e6df8923ed9...\n",
- "1 Branch etl 2d11823828ee539d7609e1a88083ada6f37d39362a4e3a...\n",
- "2 Branch dev 54622696d1313cfcb012120083d917f65558f0906f73ab..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"LIST REFERENCES\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "47\n"
- ]
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE main\").toPandas()\n",
- "table_count = spark.sql(\"select count(*) from allstar_games_stats\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from allstar_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create `experiment` branch\n",
- "--------------------------------\n",
- "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n",
- "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n",
- "and carry out our experiment, which could consist of the following steps:\n",
- "- drop `totals_stats` table\n",
- "- add data to `salaries` table\n",
- "- compare `experiment` and `main` tables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " experiment | \n",
- " cdf244cd4af77968becc0ebd0439efea6c6e6df8923ed9... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch experiment cdf244cd4af77968becc0ebd0439efea6c6e6df8923ed9..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"CREATE BRANCH experiment FROM main\").toPandas()\n",
- "spark.sql(\"USE REFERENCE experiment\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Drop the `totals_stats` table on the `experiment` branch\n",
- "spark.sql(\"DROP TABLE totals_stats\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# add some salaries for Dirk Nowitzki\n",
- "spark.sql(\n",
- " \"\"\"INSERT INTO salaries VALUES\n",
- " (\"2015-16\", \"Dallas Mavericks\", \"$8333333\", \"Dirk Nowitzki\"),\n",
- " (\"2016-17\", \"Dallas Mavericks\", \"$25000000\", \"Dirk Nowitzki\"),\n",
- " (\"2017-28\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\"),\n",
- " (\"2018-19\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\")\n",
- " \"\"\"\n",
- ").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "DELTA_LAKE_TABLE:\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.allstar_stats._delta_log\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list --ref experiment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "DELTA_LAKE_TABLE:\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.salaries._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.totals_stats._delta_log\n",
- "\thome.jovyan.notebooks.spark_warehouse.delta.allstar_stats._delta_log\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n",
- "Notice the use of the `nessie` catalog and the use of `@experiment` to view data on the `experiment` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " count(1) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 58 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " count(1)\n",
- "0 58"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"select count(*) from salaries\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n",
- "to the `main` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " count(1) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 54 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " count(1)\n",
- "0 54"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE main\").toPandas()\n",
- "spark.sql(\"select count(*) from salaries\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "And finally lets clean up after ourselves"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "DataFrame[status: string]"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"DROP BRANCH dev\")\n",
- "spark.sql(\"DROP BRANCH etl\")\n",
- "spark.sql(\"DROP BRANCH experiment\")"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.12"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
diff --git a/notebooks/nessie-iceberg-demo-nba.ipynb b/notebooks/nessie-iceberg-demo-nba.ipynb
index 7b2632a0..051ddec8 100644
--- a/notebooks/nessie-iceberg-demo-nba.ipynb
+++ b/notebooks/nessie-iceberg-demo-nba.ipynb
@@ -1,1996 +1,668 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Nessie Spark SQL Demo with NBA Dataset\n",
- "============================\n",
- "This demo showcases how to use Nessie Python API along with Spark3 from Iceberg\n",
- "\n",
- "Initialize Pyspark\n",
- "----------------------------------------------\n",
- "To get started, we will first have to do a few setup steps that give us everything we need\n",
- "to get started with Nessie. In case you're interested in the detailed setup steps for Spark, you can check out the [docs](https://projectnessie.org/tools/iceberg/spark/).\n",
- "\n",
- "The Binder server has downloaded spark and some data for us as well as started a Nessie server in the background. All we have to do is start Spark.\n",
- "\n",
- "The below cell starts a local Spark session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "WARNING: An illegal reflective access operation has occurred\n",
- "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
- "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
- "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
- "WARNING: All illegal access operations will be denied in a future release\n",
- "Ivy Default Cache set to: /home/jovyan/.ivy2/cache\n",
- "The jars for the packages stored in: /home/jovyan/.ivy2/jars\n",
- "org.apache.iceberg#iceberg-spark-runtime-3.2_2.12 added as a dependency\n",
- "org.projectnessie#nessie-spark-3.2-extensions added as a dependency\n",
- ":: resolving dependencies :: org.apache.spark#spark-submit-parent-6cba98e4-6e15-458e-a366-568683d289f7;1.0\n",
- "\tconfs: [default]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- ":: loading settings :: url = jar:file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n"
- ]
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\tfound org.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.13.1 in central\n",
- "\tfound org.projectnessie#nessie-spark-3.2-extensions;0.30.0 in central\n",
- "downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/0.13.1/iceberg-spark-runtime-3.2_2.12-0.13.1.jar ...\n",
- "\t[SUCCESSFUL ] org.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.13.1!iceberg-spark-runtime-3.2_2.12.jar (1331ms)\n",
- "downloading https://repo1.maven.org/maven2/org/projectnessie/nessie-spark-3.2-extensions/0.30.0/nessie-spark-3.2-extensions-0.30.0.jar ...\n",
- "\t[SUCCESSFUL ] org.projectnessie#nessie-spark-3.2-extensions;0.30.0!nessie-spark-3.2-extensions.jar (70ms)\n",
- ":: resolution report :: resolve 13309ms :: artifacts dl 1405ms\n",
- "\t:: modules in use:\n",
- "\torg.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.13.1 from central in [default]\n",
- "\torg.projectnessie#nessie-spark-3.2-extensions;0.30.0 from central in [default]\n",
- "\t---------------------------------------------------------------------\n",
- "\t| | modules || artifacts |\n",
- "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n",
- "\t---------------------------------------------------------------------\n",
- "\t| default | 2 | 2 | 2 | 0 || 2 | 2 |\n",
- "\t---------------------------------------------------------------------\n",
- ":: retrieving :: org.apache.spark#spark-submit-parent-6cba98e4-6e15-458e-a366-568683d289f7\n",
- "\tconfs: [default]\n",
- "\t2 artifacts copied, 0 already retrieved (22360kB/20ms)\n",
- "22/05/24 07:43:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
- "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
- "Setting default log level to \"WARN\".\n",
- "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Spark Running\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "import findspark\n",
- "from pyspark.sql import *\n",
- "from pyspark import SparkConf\n",
- "import pynessie\n",
- "\n",
- "findspark.init()\n",
- "pynessie_version = pynessie.__version__\n",
- "\n",
- "conf = SparkConf()\n",
- "# we need iceberg libraries and the nessie sql extensions\n",
- "conf.set(\n",
- " \"spark.jars.packages\",\n",
- " f\"org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.1,org.projectnessie:nessie-spark-3.2-extensions:{pynessie_version}\",\n",
- ")\n",
- "# ensure python <-> java interactions are w/ pyarrow\n",
- "conf.set(\"spark.sql.execution.pyarrow.enabled\", \"true\")\n",
- "# create catalog dev_catalog as an iceberg catalog\n",
- "conf.set(\"spark.sql.catalog.dev_catalog\", \"org.apache.iceberg.spark.SparkCatalog\")\n",
- "# tell the dev_catalog that its a Nessie catalog\n",
- "conf.set(\"spark.sql.catalog.dev_catalog.catalog-impl\", \"org.apache.iceberg.nessie.NessieCatalog\")\n",
- "# set the location for Nessie catalog to store data. Spark writes to this directory\n",
- "conf.set(\"spark.sql.catalog.dev_catalog.warehouse\", \"file://\" + os.getcwd() + \"/spark_warehouse/iceberg\")\n",
- "# set the location of the nessie server. In this demo its running locally. There are many ways to run it (see https://projectnessie.org/try/)\n",
- "conf.set(\"spark.sql.catalog.dev_catalog.uri\", \"http://localhost:19120/api/v1\")\n",
- "# default branch for Nessie catalog to work on\n",
- "conf.set(\"spark.sql.catalog.dev_catalog.ref\", \"main\")\n",
- "# use no authorization. Options are NONE AWS BASIC and aws implies running Nessie on a lambda\n",
- "conf.set(\"spark.sql.catalog.dev_catalog.auth_type\", \"NONE\")\n",
- "# enable the extensions for both Nessie and Iceberg\n",
- "conf.set(\n",
- " \"spark.sql.extensions\",\n",
- " \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSpark32SessionExtensions\",\n",
- ")\n",
- "# finally, start up the Spark server\n",
- "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
- "print(\"Spark Running\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Solving Data Engineering problems with Nessie\n",
- "============================\n",
- "\n",
- "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n",
- "\n",
- "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Set up Nessie branches\n",
- "----------------------------\n",
- "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n",
- "\n",
- "- Create a new branch named `dev`\n",
- "- List all branches\n",
- "\n",
- "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " dev | \n",
- " 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"CREATE BRANCH dev IN dev_catalog FROM main\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n",
- "\n",
- "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " dev | \n",
- " 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Branch | \n",
- " main | \n",
- " 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...\n",
- "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create tables under dev branch\n",
- "-------------------------------------\n",
- "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n",
- "\n",
- "We create two tables under the `dev` branch:\n",
- "- `salaries`\n",
- "- `totals_stats`\n",
- "\n",
- "These tables list the salaries per player per year and their stats per year.\n",
- "\n",
- "To create the data we:\n",
- "\n",
- "1. switch our branch context to dev\n",
- "2. create the table\n",
- "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE dev IN dev_catalog\")\n",
- "\n",
- "# Creating `salaries` table\n",
- "spark.sql(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.salaries\n",
- " (Season STRING, Team STRING, Salary STRING, Player STRING) USING iceberg\"\"\"\n",
- ")\n",
- "\n",
- "spark.sql(\n",
- " \"\"\"CREATE OR REPLACE TEMPORARY VIEW salaries_table USING csv\n",
- " OPTIONS (path \"../datasets/nba/salaries.csv\", header true)\"\"\"\n",
- ")\n",
- "spark.sql(\"INSERT INTO dev_catalog.nba.salaries SELECT * FROM salaries_table\")\n",
- "\n",
- "# Creating `totals_stats` table\n",
- "spark.sql(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.totals_stats (\n",
- " Season STRING, Age STRING, Team STRING, ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING,\n",
- " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n",
- " USING iceberg\"\"\"\n",
- ")\n",
- "spark.sql(\n",
- " \"\"\"CREATE OR REPLACE TEMPORARY VIEW stats_table USING csv\n",
- " OPTIONS (path \"../datasets/nba/totals_stats.csv\", header true)\"\"\"\n",
- ")\n",
- "spark.sql(\"INSERT INTO dev_catalog.nba.totals_stats SELECT * FROM stats_table\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now we count the rows in our tables to ensure they are the same number as the csv files. Note we use the `table@branch` notation which overrides the context set by a `USE REFERENCE` command."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "50\n",
- "92\n"
- ]
- }
- ],
- "source": [
- "table_count = spark.sql(\"select count(*) from dev_catalog.nba.`salaries@dev`\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)\n",
- "\n",
- "table_count = spark.sql(\"select count(*) from dev_catalog.nba.`totals_stats@dev`\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Check generated tables\n",
- "----------------------------\n",
- "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n",
- "let's verify that the `main` branch was not altered by our changes."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " namespace | \n",
- " tableName | \n",
- " isTemporary | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: [namespace, tableName, isTemporary]\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
- "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And on the `dev` branch we expect to see two tables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " namespace | \n",
- " tableName | \n",
- " isTemporary | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " nba | \n",
- " totals_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " nba | \n",
- " salaries | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " namespace tableName isTemporary\n",
- "0 nba totals_stats False\n",
- "1 nba salaries False"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE dev IN dev_catalog\").toPandas()\n",
- "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "We can also verify that the `dev` and `main` branches point to different commits"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " dev | \n",
- " 70a8df769b477de5b9157691edef1efca8a640ae9f7137... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Branch | \n",
- " main | \n",
- " 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch dev 70a8df769b477de5b9157691edef1efca8a640ae9f7137...\n",
- "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Dev promotion into main\n",
- "-----------------------\n",
- "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n",
- "We merge `dev` into `main` via the Spark sql `merge` command.\n",
- "Both branches should be at the same revision after merging/promotion."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " main | \n",
- " af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name hash\n",
- "0 main af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"MERGE BRANCH dev INTO main IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can verify that the `main` branch now contains the expected tables and row counts.\n",
- "\n",
- "The tables are now on `main` and ready for consumption by our blog authors and analysts!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " main | \n",
- " af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Branch | \n",
- " dev | \n",
- " 70a8df769b477de5b9157691edef1efca8a640ae9f7137... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch main af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38...\n",
- "1 Branch dev 70a8df769b477de5b9157691edef1efca8a640ae9f7137..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " namespace | \n",
- " tableName | \n",
- " isTemporary | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " nba | \n",
- " salaries | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " nba | \n",
- " totals_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " namespace tableName isTemporary\n",
- "0 nba salaries False\n",
- "1 nba totals_stats False"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
- "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "50\n",
- "92\n"
- ]
- }
- ],
- "source": [
- "table_count = spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)\n",
- "\n",
- "table_count = spark.sql(\"select count(*) from dev_catalog.nba.totals_stats\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "Perform regular ETL on the new tables\n",
- "-------------------\n",
- "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n",
- "\n",
- "1. Update the salaries table to add new data\n",
- "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n",
- "3. We create a new table to hold information about the players appearances in all star games\n",
- "\n",
- "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " etl | \n",
- " af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch etl af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"CREATE BRANCH etl IN dev_catalog FROM main\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# add some salaries for Kevin Durant\n",
- "spark.sql(\"USE REFERENCE etl IN dev_catalog\")\n",
- "spark.sql(\n",
- " \"\"\"INSERT INTO dev_catalog.nba.salaries VALUES\n",
- " (\"2017-18\", \"Golden State Warriors\", \"$25000000\", \"Kevin Durant\"),\n",
- " (\"2018-19\", \"Golden State Warriors\", \"$30000000\", \"Kevin Durant\"),\n",
- " (\"2019-20\", \"Brooklyn Nets\", \"$37199000\", \"Kevin Durant\"),\n",
- " (\"2020-21\", \"Brooklyn Nets\", \"$39058950\", \"Kevin Durant\")\n",
- " \"\"\"\n",
- ").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Dropping a column in the `totals_stats` table\n",
- "spark.sql(\"ALTER TABLE dev_catalog.nba.totals_stats DROP COLUMN Age\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " count(1) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 47 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " count(1)\n",
- "0 47"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Creating `allstar_games_stats` table and viewing the contents\n",
- "spark.sql(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.allstar_games_stats (\n",
- " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING,\n",
- " TOV STRING, PF STRING, PTS STRING, Player STRING)\n",
- " USING iceberg\"\"\"\n",
- ")\n",
- "spark.sql(\n",
- " \"\"\"CREATE OR REPLACE TEMPORARY VIEW allstar_table USING csv\n",
- " OPTIONS (path \"../datasets/nba/allstar_games_stats.csv\", header true)\"\"\"\n",
- ")\n",
- "spark.sql(\"INSERT INTO dev_catalog.nba.allstar_games_stats SELECT * FROM allstar_table\").toPandas()\n",
- "\n",
- "# notice how we view the data on the etl branch via @etl\n",
- "spark.sql(\"select count(*) from dev_catalog.nba.`allstar_games_stats@etl`\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can verify that the new table isn't on the `main` branch but is present on the etl branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " namespace | \n",
- " tableName | \n",
- " isTemporary | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " nba | \n",
- " salaries | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " nba | \n",
- " totals_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " namespace tableName isTemporary\n",
- "0 nba salaries False\n",
- "1 nba totals_stats False"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
- "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " namespace | \n",
- " tableName | \n",
- " isTemporary | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " nba | \n",
- " allstar_games_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " nba | \n",
- " totals_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " nba | \n",
- " salaries | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " namespace tableName isTemporary\n",
- "0 nba allstar_games_stats False\n",
- "1 nba totals_stats False\n",
- "2 nba salaries False"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE etl IN dev_catalog\").toPandas()\n",
- "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that we are happy with the data we can again merge it into `main`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " main | \n",
- " e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " name hash\n",
- "0 main e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"MERGE BRANCH etl INTO main IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now lets verify that the changes exist on the `main` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " namespace | \n",
- " tableName | \n",
- " isTemporary | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " nba | \n",
- " salaries | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " nba | \n",
- " allstar_games_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " nba | \n",
- " totals_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " namespace tableName isTemporary\n",
- "0 nba salaries False\n",
- "1 nba allstar_games_stats False\n",
- "2 nba totals_stats False"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
- "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " main | \n",
- " e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2... | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " Branch | \n",
- " etl | \n",
- " 957c1254ab0a3e3bd1e306669ebe7073e27a97966bcfda... | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " Branch | \n",
- " dev | \n",
- " 70a8df769b477de5b9157691edef1efca8a640ae9f7137... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch main e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2...\n",
- "1 Branch etl 957c1254ab0a3e3bd1e306669ebe7073e27a97966bcfda...\n",
- "2 Branch dev 70a8df769b477de5b9157691edef1efca8a640ae9f7137..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "47\n"
- ]
- }
- ],
- "source": [
- "table_count = spark.sql(\"select count(*) from dev_catalog.nba.allstar_games_stats\").toPandas().values[0][0]\n",
- "csv_count = spark.sql(\"select count(*) from allstar_table\").toPandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create `experiment` branch\n",
- "--------------------------------\n",
- "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n",
- "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n",
- "and carry out our experiment, which could consist of the following steps:\n",
- "- drop `totals_stats` table\n",
- "- add data to `salaries` table\n",
- "- compare `experiment` and `main` tables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " refType | \n",
- " name | \n",
- " hash | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " Branch | \n",
- " experiment | \n",
- " e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " refType name hash\n",
- "0 Branch experiment e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2..."
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"CREATE BRANCH experiment IN dev_catalog FROM main\").toPandas()\n",
- "spark.sql(\"USE REFERENCE experiment IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Drop the `totals_stats` table on the `experiment` branch\n",
- "spark.sql(\"DROP TABLE dev_catalog.nba.totals_stats\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- "Empty DataFrame\n",
- "Columns: []\n",
- "Index: []"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# add some salaries for Dirk Nowitzki\n",
- "spark.sql(\n",
- " \"\"\"INSERT INTO dev_catalog.nba.salaries VALUES\n",
- " (\"2015-16\", \"Dallas Mavericks\", \"$8333333\", \"Dirk Nowitzki\"),\n",
- " (\"2016-17\", \"Dallas Mavericks\", \"$25000000\", \"Dirk Nowitzki\"),\n",
- " (\"2017-28\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\"),\n",
- " (\"2018-19\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\")\n",
- " \"\"\"\n",
- ").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " namespace | \n",
- " tableName | \n",
- " isTemporary | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " nba | \n",
- " salaries | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " nba | \n",
- " allstar_games_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " namespace tableName isTemporary\n",
- "0 nba salaries False\n",
- "1 nba allstar_games_stats False"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " namespace | \n",
- " tableName | \n",
- " isTemporary | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " nba | \n",
- " salaries | \n",
- " False | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " nba | \n",
- " allstar_games_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " nba | \n",
- " totals_stats | \n",
- " False | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " namespace tableName isTemporary\n",
- "0 nba salaries False\n",
- "1 nba allstar_games_stats False\n",
- "2 nba totals_stats False"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
- "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n",
- "Notice the use of the `nessie` catalog and the use of `@experiment` to view data on the `experiment` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " count(1) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 58 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " count(1)\n",
- "0 58"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"select count(*) from dev_catalog.nba.`salaries@experiment`\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n",
- "to the `main` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " count(1) | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 54 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " count(1)\n",
- "0 54"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "And finally lets clean up after ourselves"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "DataFrame[status: string]"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "spark.sql(\"DROP BRANCH dev IN dev_catalog\")\n",
- "spark.sql(\"DROP BRANCH etl IN dev_catalog\")\n",
- "spark.sql(\"DROP BRANCH experiment IN dev_catalog\")"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Nessie Spark SQL Demo with NBA Dataset\n",
+ "============================\n",
+ "This demo showcases how to use the Nessie Python API along with Spark3 from Iceberg\n",
+ "\n",
+ "Initialize Pyspark\n",
+ "----------------------------------------------\n",
+ "To get started, we will first have to do a few setup steps that give us everything we need\n",
+ "to get started with Nessie. In case you're interested in the detailed setup steps for Spark, you can check out the [docs](https://projectnessie.org/tools/iceberg/spark/).\n",
+ "\n",
+ "The Binder server has downloaded spark and some data for us as well as started a Nessie server in the background. All we have to do is start Spark.\n",
+ "\n",
+ "The below cell starts a local Spark session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import findspark\n",
+ "from pyspark.sql import *\n",
+ "from pyspark import SparkConf\n",
+ "import pynessie\n",
+ "\n",
+ "findspark.init()\n",
+ "pynessie_version = pynessie.__version__\n",
+ "\n",
+ "conf = SparkConf()\n",
+ "# we need iceberg libraries and the nessie sql extensions\n",
+ "conf.set(\n",
+ " \"spark.jars.packages\",\n",
+ " f\"org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:1.4.2,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.2_2.12:0.74.0\",\n",
+ ")\n",
+ "# ensure python <-> java interactions are w/ pyarrow\n",
+ "conf.set(\"spark.sql.execution.pyarrow.enabled\", \"true\")\n",
+ "# create catalog dev_catalog as an iceberg catalog\n",
+ "conf.set(\"spark.sql.catalog.dev_catalog\", \"org.apache.iceberg.spark.SparkCatalog\")\n",
+ "# tell the dev_catalog that it's a Nessie catalog\n",
+ "conf.set(\"spark.sql.catalog.dev_catalog.catalog-impl\", \"org.apache.iceberg.nessie.NessieCatalog\")\n",
+ "# set the location for Nessie catalog to store data. Spark writes to this directory\n",
+ "conf.set(\"spark.sql.catalog.dev_catalog.warehouse\", \"file://\" + os.getcwd() + \"/spark_warehouse/iceberg\")\n",
+ "# set the location of the nessie server. In this demo it's running locally. There are many ways to run it (see https://projectnessie.org/try/)\n",
+ "conf.set(\"spark.sql.catalog.dev_catalog.uri\", \"http://localhost:19120/api/v1\")\n",
+ "# default branch for Nessie catalog to work on\n",
+ "conf.set(\"spark.sql.catalog.dev_catalog.ref\", \"main\")\n",
+ "# use no authorization. Options are NONE AWS BASIC and aws implies running Nessie on a lambda\n",
+ "conf.set(\"spark.sql.catalog.dev_catalog.auth_type\", \"NONE\")\n",
+ "# enable the extensions for both Nessie and Iceberg\n",
+ "conf.set(\n",
+ " \"spark.sql.extensions\",\n",
+ " \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions\",\n",
+ ")\n",
+ "# finally, start up the Spark server\n",
+ "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n",
+ "print(\"Spark Running\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Solving Data Engineering problems with Nessie\n",
+ "============================\n",
+ "\n",
+ "In this demo, we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles, they need access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n",
+ "\n",
+ "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing it to the analysts."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Set up Nessie branches\n",
+ "----------------------------\n",
+ "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n",
+ "\n",
+ "- Create a new branch named `dev`\n",
+ "- List all branches\n",
+ "\n",
+ "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the 'nba' namespace in Nessie\n",
+ "spark.sql(\"CREATE NAMESPACE dev_catalog.nba\")\n",
+ "\n",
+ "# Create the 'dev' branch from 'main' branch\n",
+ "spark.sql(\"CREATE BRANCH dev IN dev_catalog FROM main\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We have created the branch `dev` and we can see the branch with the Nessie `hash` it's currently pointing to.\n",
+ "\n",
+ "Below we list all branches. Note that the auto-created `main` branch already exists and both branches point at the same empty `hash` initially."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create tables under dev branch\n",
+ "-------------------------------------\n",
+ "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n",
+ "\n",
+ "We create two tables under the `dev` branch:\n",
+ "- `salaries`\n",
+ "- `totals_stats`\n",
+ "\n",
+ "These tables list the salaries per player per year and their stats per year.\n",
+ "\n",
+ "To create the data we:\n",
+ "\n",
+ "1. switch our branch context to dev\n",
+ "2. create the table\n",
+ "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"USE REFERENCE dev IN dev_catalog\")\n",
+ "\n",
+ "# Creating `salaries` table\n",
+ "spark.sql(\n",
+ " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.salaries\n",
+ " (Season STRING, Team STRING, Salary STRING, Player STRING) USING iceberg\"\"\"\n",
+ ")\n",
+ "\n",
+ "spark.sql(\n",
+ " \"\"\"CREATE OR REPLACE TEMPORARY VIEW salaries_table USING csv\n",
+ " OPTIONS (path \"../datasets/nba/salaries.csv\", header true)\"\"\"\n",
+ ")\n",
+ "spark.sql(\"INSERT INTO dev_catalog.nba.salaries SELECT * FROM salaries_table\")\n",
+ "\n",
+ "# Creating `totals_stats` table\n",
+ "spark.sql(\n",
+ " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.totals_stats (\n",
+ " Season STRING, Age STRING, Team STRING, ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING,\n",
+ " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n",
+ " USING iceberg\"\"\"\n",
+ ")\n",
+ "spark.sql(\n",
+ " \"\"\"CREATE OR REPLACE TEMPORARY VIEW stats_table USING csv\n",
+ " OPTIONS (path \"../datasets/nba/totals_stats.csv\", header true)\"\"\"\n",
+ ")\n",
+ "spark.sql(\"INSERT INTO dev_catalog.nba.totals_stats SELECT * FROM stats_table\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we count the rows in our tables to ensure they match the number of rows in the CSV files. Note that we use the `table@branch` notation, which overrides the context set by a `USE REFERENCE` command."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "table_count = spark.sql(\"select count(*) from dev_catalog.nba.`salaries@dev`\").toPandas().values[0][0]\n",
+ "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)\n",
+ "\n",
+ "table_count = spark.sql(\"select count(*) from dev_catalog.nba.`totals_stats@dev`\").toPandas().values[0][0]\n",
+ "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Check generated tables\n",
+ "----------------------------\n",
+ "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n",
+ "let's verify that the `main` branch was not altered by our changes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
+ "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And on the `dev` branch we expect to see two tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"USE REFERENCE dev IN dev_catalog\").toPandas()\n",
+ "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "We can also verify that the `dev` and `main` branches point to different commits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Dev promotion into main\n",
+ "-----------------------\n",
+ "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n",
+ "We merge `dev` into `main` via the Spark SQL `MERGE BRANCH` command.\n",
+ "Both branches should be at the same revision after merging/promotion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"MERGE BRANCH dev INTO main IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can verify that the `main` branch now contains the expected tables and row counts.\n",
+ "\n",
+ "The tables are now on `main` and ready for consumption by our blog authors and analysts!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
+ "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "table_count = spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas().values[0][0]\n",
+ "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)\n",
+ "\n",
+ "table_count = spark.sql(\"select count(*) from dev_catalog.nba.totals_stats\").toPandas().values[0][0]\n",
+ "csv_count = spark.sql(\"select count(*) from stats_table\").toPandas().values[0][0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "Perform regular ETL on the new tables\n",
+ "-------------------\n",
+ "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n",
+ "\n",
+ "1. Update the salaries table to add new data\n",
+ "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n",
+ "3. We create a new table to hold information about the players' appearances in All-Star games\n",
+ "\n",
+ "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"CREATE BRANCH etl IN dev_catalog FROM main\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.12"
+ },
+ "outputs": [],
+ "source": [
+ "# add some salaries for Kevin Durant\n",
+ "spark.sql(\"USE REFERENCE etl IN dev_catalog\")\n",
+ "spark.sql(\n",
+ " \"\"\"INSERT INTO dev_catalog.nba.salaries VALUES\n",
+ " (\"2017-18\", \"Golden State Warriors\", \"$25000000\", \"Kevin Durant\"),\n",
+ " (\"2018-19\", \"Golden State Warriors\", \"$30000000\", \"Kevin Durant\"),\n",
+ " (\"2019-20\", \"Brooklyn Nets\", \"$37199000\", \"Kevin Durant\"),\n",
+ " (\"2020-21\", \"Brooklyn Nets\", \"$39058950\", \"Kevin Durant\")\n",
+ " \"\"\"\n",
+ ").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Dropping a column in the `totals_stats` table\n",
+ "spark.sql(\"ALTER TABLE dev_catalog.nba.totals_stats DROP COLUMN Age\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creating `allstar_games_stats` table and viewing the contents\n",
+ "spark.sql(\n",
+ " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.allstar_games_stats (\n",
+ " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING,\n",
+ " TOV STRING, PF STRING, PTS STRING, Player STRING)\n",
+ " USING iceberg\"\"\"\n",
+ ")\n",
+ "spark.sql(\n",
+ " \"\"\"CREATE OR REPLACE TEMPORARY VIEW allstar_table USING csv\n",
+ " OPTIONS (path \"../datasets/nba/allstar_games_stats.csv\", header true)\"\"\"\n",
+ ")\n",
+ "spark.sql(\"INSERT INTO dev_catalog.nba.allstar_games_stats SELECT * FROM allstar_table\").toPandas()\n",
+ "\n",
+ "# notice how we view the data on the etl branch via @etl\n",
+ "spark.sql(\"select count(*) from dev_catalog.nba.`allstar_games_stats@etl`\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can verify that the new table isn't on the `main` branch but is present on the `etl` branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
+ "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"USE REFERENCE etl IN dev_catalog\").toPandas()\n",
+ "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we are happy with the data we can again merge it into `main`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "spark.sql(\"MERGE BRANCH etl INTO main IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now let's verify that the changes exist on the `main` branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
+ "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "table_count = spark.sql(\"select count(*) from dev_catalog.nba.allstar_games_stats\").toPandas().values[0][0]\n",
+ "csv_count = spark.sql(\"select count(*) from allstar_table\").toPandas().values[0][0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create `experiment` branch\n",
+ "--------------------------------\n",
+ "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n",
+ "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n",
+ "and carry out our experiment, which could consist of the following steps:\n",
+ "- drop `totals_stats` table\n",
+ "- add data to `salaries` table\n",
+ "- compare `experiment` and `main` tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"CREATE BRANCH experiment IN dev_catalog FROM main\").toPandas()\n",
+ "spark.sql(\"USE REFERENCE experiment IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the `totals_stats` table on the `experiment` branch\n",
+ "spark.sql(\"DROP TABLE dev_catalog.nba.totals_stats\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add some salaries for Dirk Nowitzki\n",
+ "spark.sql(\n",
+ " \"\"\"INSERT INTO dev_catalog.nba.salaries VALUES\n",
+ " (\"2015-16\", \"Dallas Mavericks\", \"$8333333\", \"Dirk Nowitzki\"),\n",
+ " (\"2016-17\", \"Dallas Mavericks\", \"$25000000\", \"Dirk Nowitzki\"),\n",
+ " (\"2017-28\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\"),\n",
+ " (\"2018-19\", \"Dallas Mavericks\", \"$5000000\", \"Dirk Nowitzki\")\n",
+ " \"\"\"\n",
+ ").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n",
+ "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n",
+ "Notice the use of the `dev_catalog` catalog and the use of `@experiment` to view data on the `experiment` branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "spark.sql(\"select count(*) from dev_catalog.nba.`salaries@experiment`\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` because the query uses\n",
+ "the current reference, which we switched back to `main` with `USE REFERENCE main` above"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "And finally let's clean up after ourselves"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
}
+ },
+ "outputs": [],
+ "source": [
+ "spark.sql(\"DROP BRANCH dev IN dev_catalog\")\n",
+ "spark.sql(\"DROP BRANCH etl IN dev_catalog\")\n",
+ "spark.sql(\"DROP BRANCH experiment IN dev_catalog\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/nessie-iceberg-flink-demo-nba.ipynb b/notebooks/nessie-iceberg-flink-demo-nba.ipynb
index 16646608..c94d5660 100644
--- a/notebooks/nessie-iceberg-flink-demo-nba.ipynb
+++ b/notebooks/nessie-iceberg-flink-demo-nba.ipynb
@@ -1,2014 +1,763 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Nessie Iceberg/Flink SQL Demo with NBA Dataset\n",
- "============================\n",
- "This demo showcases how to use Nessie Python API along with Flink from Iceberg\n",
- "\n",
- "Initialize PyFlink\n",
- "----------------------------------------------\n",
- "To get started, we will first have to do a few setup steps that give us everything we need\n",
- "to get started with Nessie. In case you're interested in the detailed setup steps for Flink, you can check out the [docs](https://projectnessie.org/tools/iceberg/flink/)\n",
- "\n",
- "The Binder server has downloaded flink and some data for us as well as started a Nessie server in the background. All we have to do is start Flink\n",
- "\n",
- "The below cell starts a local Flink session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "SLF4J: Class path contains multiple SLF4J bindings.\n",
- "SLF4J: Found binding in [jar:file:/srv/conda/envs/flink-demo/lib/python3.7/site-packages/pyflink/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n",
- "SLF4J: Found binding in [jar:file:/srv/conda/envs/flink-demo/lib/python3.7/site-packages/pyflink/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n",
- "SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n",
- "SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "\n",
- "\n",
- "Flink running\n",
- "\n",
- "\n",
- "\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "from pyflink.datastream import StreamExecutionEnvironment\n",
- "from pyflink.table import StreamTableEnvironment\n",
- "from pyflink.table.expressions import lit\n",
- "from pynessie import init\n",
- "\n",
- "# where we will store our data\n",
- "warehouse = os.path.join(os.getcwd(), \"flink-warehouse\")\n",
- "# this was downloaded when Binder started, its available on maven central\n",
- "iceberg_flink_runtime_jar = os.path.join(os.getcwd(), \"../iceberg-flink-runtime-1.13-0.13.1.jar\")\n",
- "assert os.path.exists(iceberg_flink_runtime_jar)\n",
- "\n",
- "env = StreamExecutionEnvironment.get_execution_environment()\n",
- "env.add_jars(\"file://{}\".format(iceberg_flink_runtime_jar))\n",
- "table_env = StreamTableEnvironment.create(env)\n",
- "\n",
- "nessie_client = init()\n",
- "\n",
- "\n",
- "def create_ref_catalog(ref):\n",
- " \"\"\"\n",
- " Create a flink catalog that is tied to a specific ref.\n",
- "\n",
- " In order to create the catalog we have to first create the branch\n",
- " \"\"\"\n",
- " default_branch = nessie_client.get_default_branch()\n",
- " if ref != default_branch:\n",
- " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n",
- " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n",
- " # The important args below are:\n",
- " # type - tell Flink to use Iceberg as the catalog\n",
- " # catalog-impl - which Iceberg catalog to use, in this case we want Nessie\n",
- " # uri - the location of the nessie server.\n",
- " # ref - the Nessie ref/branch we want to use (defaults to main)\n",
- " # warehouse - the location this catalog should store its data\n",
- " table_env.execute_sql(\n",
- " f\"\"\"CREATE CATALOG {ref}_catalog WITH (\n",
- " 'type'='iceberg',\n",
- " 'catalog-impl'='org.apache.iceberg.nessie.NessieCatalog',\n",
- " 'uri'='http://localhost:19120/api/v1',\n",
- " 'ref'='{ref}',\n",
- " 'warehouse' = '{warehouse}')\"\"\"\n",
- " )\n",
- "\n",
- "\n",
- "create_ref_catalog(nessie_client.get_default_branch())\n",
- "print(\"\\n\\n\\nFlink running\\n\\n\\n\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Solving Data Engineering problems with Nessie\n",
- "============================\n",
- "\n",
- "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n",
- "\n",
- "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Set up Nessie branches (via Nessie CLI)\n",
- "----------------------------\n",
- "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n",
- "\n",
- "- Create a new branch named `dev`\n",
- "- List all branches\n",
- "\n",
- "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "create_ref_catalog(\"dev\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n",
- "\n",
- "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n",
- "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n",
- "\u001b[0m\n"
- ]
- }
- ],
- "source": [
- "!nessie --verbose branch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create tables under dev branch\n",
- "-------------------------------------\n",
- "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n",
- "\n",
- "We create two tables under the `dev` branch:\n",
- "- `salaries`\n",
- "- `totals_stats`\n",
- "\n",
- "These tables list the salaries per player per year and their stats per year.\n",
- "\n",
- "To create the data we:\n",
- "\n",
- "1. switch our branch context to dev\n",
- "2. create the table\n",
- "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "WARNING: An illegal reflective access operation has occurred\n",
- "WARNING: Illegal reflective access by org.apache.hadoop.security.authentication.util.KerberosUtil (file:/srv/conda/envs/flink-demo/lib/python3.7/site-packages/pyflink/lib/hadoop-auth-2.10.1.jar) to method sun.security.krb5.Config.getInstance()\n",
- "WARNING: Please consider reporting this to the maintainers of org.apache.hadoop.security.authentication.util.KerberosUtil\n",
- "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
- "WARNING: All illegal access operations will be denied in a future release\n",
- "log4j:WARN No appenders could be found for logger (org.apache.htrace.core.Tracer).\n",
- "log4j:WARN Please initialize the log4j system properly.\n",
- "log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.\n"
- ]
- },
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:58,465 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:58,465 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:58,465 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:59,664 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:59,664 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:59,665 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:44:59,665 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "\n",
- "\n",
- "\n",
- "Added 51 rows to the salaries table and 93 rows to the totals_stats table.\n",
- "\n",
- "\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Load the dataset\n",
- "from pyflink.table import DataTypes\n",
- "from pyflink.table.descriptors import Schema, OldCsv, FileSystem\n",
- "\n",
- "# Creating `salaries` table\n",
- "(\n",
- " table_env.connect(FileSystem().path(\"../datasets/nba/salaries.csv\"))\n",
- " .with_format(\n",
- " OldCsv()\n",
- " .field(\"Season\", DataTypes.STRING())\n",
- " .field(\"Team\", DataTypes.STRING())\n",
- " .field(\"Salary\", DataTypes.STRING())\n",
- " .field(\"Player\", DataTypes.STRING())\n",
- " )\n",
- " .with_schema(\n",
- " Schema()\n",
- " .field(\"Season\", DataTypes.STRING())\n",
- " .field(\"Team\", DataTypes.STRING())\n",
- " .field(\"Salary\", DataTypes.STRING())\n",
- " .field(\"Player\", DataTypes.STRING())\n",
- " )\n",
- " .create_temporary_table(\"dev_catalog.nba.salaries_temp\")\n",
- ")\n",
- "\n",
- "table_env.execute_sql(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.salaries\n",
- " (Season STRING, Team STRING, Salary STRING, Player STRING)\"\"\"\n",
- ").wait()\n",
- "\n",
- "tab = table_env.from_path(\"dev_catalog.nba.salaries_temp\")\n",
- "tab.execute_insert(\"dev_catalog.nba.salaries\").wait()\n",
- "\n",
- "# Creating `totals_stats` table\n",
- "(\n",
- " table_env.connect(FileSystem().path(\"../datasets/nba/totals_stats.csv\"))\n",
- " .with_format(\n",
- " OldCsv()\n",
- " .field(\"Season\", DataTypes.STRING())\n",
- " .field(\"Age\", DataTypes.STRING())\n",
- " .field(\"Team\", DataTypes.STRING())\n",
- " .field(\"ORB\", DataTypes.STRING())\n",
- " .field(\"DRB\", DataTypes.STRING())\n",
- " .field(\"TRB\", DataTypes.STRING())\n",
- " .field(\"AST\", DataTypes.STRING())\n",
- " .field(\"STL\", DataTypes.STRING())\n",
- " .field(\"BLK\", DataTypes.STRING())\n",
- " .field(\"TOV\", DataTypes.STRING())\n",
- " .field(\"PTS\", DataTypes.STRING())\n",
- " .field(\"Player\", DataTypes.STRING())\n",
- " .field(\"RSorPO\", DataTypes.STRING())\n",
- " )\n",
- " .with_schema(\n",
- " Schema()\n",
- " .field(\"Season\", DataTypes.STRING())\n",
- " .field(\"Age\", DataTypes.STRING())\n",
- " .field(\"Team\", DataTypes.STRING())\n",
- " .field(\"ORB\", DataTypes.STRING())\n",
- " .field(\"DRB\", DataTypes.STRING())\n",
- " .field(\"TRB\", DataTypes.STRING())\n",
- " .field(\"AST\", DataTypes.STRING())\n",
- " .field(\"STL\", DataTypes.STRING())\n",
- " .field(\"BLK\", DataTypes.STRING())\n",
- " .field(\"TOV\", DataTypes.STRING())\n",
- " .field(\"PTS\", DataTypes.STRING())\n",
- " .field(\"Player\", DataTypes.STRING())\n",
- " .field(\"RSorPO\", DataTypes.STRING())\n",
- " )\n",
- " .create_temporary_table(\"dev_catalog.nba.totals_stats_temp\")\n",
- ")\n",
- "\n",
- "table_env.execute_sql(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.totals_stats (Season STRING, Age STRING, Team STRING,\n",
- " ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING, PTS STRING,\n",
- " Player STRING, RSorPO STRING)\"\"\"\n",
- ").wait()\n",
- "\n",
- "tab = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\")\n",
- "tab.execute_insert(\"dev_catalog.nba.totals_stats\").wait()\n",
- "\n",
- "salaries = table_env.from_path(\"main_catalog.nba.`salaries@dev`\").select(lit(1).count).to_pandas().values[0][0]\n",
- "totals_stats = table_env.from_path(\"main_catalog.nba.`totals_stats@dev`\").select(lit(1).count).to_pandas().values[0][0]\n",
- "print(f\"\\n\\n\\nAdded {salaries} rows to the salaries table and {totals_stats} rows to the totals_stats table.\\n\\n\\n\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "Now we count the rows in our tables to ensure they are the same number as the csv files. Note we use the `table@branch` notation which overrides the context set by the catalog."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2022-05-24 07:45:04,807 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:04,869 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:04,872 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:04,874 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:04,876 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:04,879 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:04,881 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:04,883 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "51\n",
- "2022-05-24 07:45:06,280 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:06,344 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:06,347 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:06,351 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:06,354 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:06,357 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:06,360 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:06,364 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "93\n"
- ]
- }
- ],
- "source": [
- "table_count = table_env.from_path(\"dev_catalog.nba.`salaries@dev`\").select(\"Season.count\").to_pandas().values[0][0]\n",
- "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(\"Season.count\").to_pandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)\n",
- "\n",
- "table_count = table_env.from_path(\"dev_catalog.nba.`totals_stats@dev`\").select(\"Season.count\").to_pandas().values[0][0]\n",
- "csv_count = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Check generated tables\n",
- "----------------------------\n",
- "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n",
- "let's verify that the `main` branch was not altered by our changes."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "And on the `dev` branch we expect to see two tables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.totals_stats\n",
- "\tnba.salaries\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list --ref dev"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "We can also verify that the `dev` and `main` branches point to different commits"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " dev f48b93594ddead3a7616a271d657f0fff97cd0c4c04d4a579fa165aa96a69908\n",
- "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n",
- "\u001b[0m\n"
- ]
- }
- ],
- "source": [
- "!nessie --verbose branch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Dev promotion into main\n",
- "-----------------------\n",
- "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n",
- "We merge `dev` into `main` via the command line `merge` command.\n",
- "Both branches should be at the same revision after merging/promotion."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie merge dev -b main --force"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "We can verify that the `main` branch now contains the expected tables and row counts.\n",
- "\n",
- "The tables are now on `main` and ready for consumption by our blog authors and analysts!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[33m* main facfd43be1d062734ca0cda5ae900dde398180bf3f370a19627da8a2419589b0\n",
- "\u001b[0m dev f48b93594ddead3a7616a271d657f0fff97cd0c4c04d4a579fa165aa96a69908\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie --verbose branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.totals_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2022-05-24 07:45:10,661 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:10,724 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:10,725 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:10,727 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:10,729 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:10,730 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:10,732 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:10,733 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:12,239 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:12,304 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:12,307 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:12,312 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:12,316 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:12,319 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:12,322 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:12,326 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n"
- ]
- }
- ],
- "source": [
- "table_count = table_env.from_path(\"main_catalog.nba.salaries\").select(\"Season.count\").to_pandas().values[0][0]\n",
- "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(\"Season.count\").to_pandas().values[0][0]\n",
- "assert table_count == csv_count\n",
- "\n",
- "table_count = table_env.from_path(\"main_catalog.nba.totals_stats\").select(\"Season.count\").to_pandas().values[0][0]\n",
- "csv_count = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n",
- "assert table_count == csv_count"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "Perform regular ETL on the new tables\n",
- "-------------------\n",
- "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n",
- "\n",
- "1. Update the salaries table to add new data\n",
- "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n",
- "3. We create a new table to hold information about the players appearances in all star games\n",
- "\n",
- "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "create_ref_catalog(\"etl\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2022-05-24 07:45:13,368 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n"
- ]
- }
- ],
- "source": [
- "# add some salaries for Kevin Durant\n",
- "table_env.execute_sql(\n",
- " \"\"\"INSERT INTO etl_catalog.nba.salaries\n",
- " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n",
- " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n",
- " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n",
- " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n",
- ").wait()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Rename the table `totals_stats` to `new_totals_stats`\n",
- "table_env.execute_sql(\"ALTER TABLE etl_catalog.nba.totals_stats RENAME TO etl_catalog.nba.new_totals_stats\").wait()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n",
- "2022-05-24 07:45:15,480 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:15,543 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:15,546 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:15,549 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:15,551 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:15,554 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:15,557 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:15,560 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n"
- ]
- },
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " Season | \n",
- " Age | \n",
- " Team | \n",
- " ORB | \n",
- " TRB | \n",
- " AST | \n",
- " STL | \n",
- " BLK | \n",
- " TOV | \n",
- " PF | \n",
- " PTS | \n",
- " Player | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 2004-05 | \n",
- " 26 | \n",
- " LAL | \n",
- " 3 | \n",
- " 6 | \n",
- " 7 | \n",
- " 3 | \n",
- " 1 | \n",
- " 4 | \n",
- " 5 | \n",
- " 16 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 2005-06 | \n",
- " 27 | \n",
- " LAL | \n",
- " 0 | \n",
- " 7 | \n",
- " 8 | \n",
- " 3 | \n",
- " 0 | \n",
- " 3 | \n",
- " 5 | \n",
- " 8 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 2006-07 | \n",
- " 28 | \n",
- " LAL | \n",
- " 1 | \n",
- " 5 | \n",
- " 6 | \n",
- " 6 | \n",
- " 0 | \n",
- " 4 | \n",
- " 1 | \n",
- " 31 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 2007-08 | \n",
- " 29 | \n",
- " LAL | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 2008-09 | \n",
- " 30 | \n",
- " LAL | \n",
- " 1 | \n",
- " 4 | \n",
- " 4 | \n",
- " 4 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 27 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 2009-10 | \n",
- " 31 | \n",
- " LAL | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 6 | \n",
- " 2009-10 | \n",
- " 25 | \n",
- " CLE | \n",
- " 1 | \n",
- " 5 | \n",
- " 6 | \n",
- " 4 | \n",
- " 0 | \n",
- " 2 | \n",
- " 1 | \n",
- " 25 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 2010-11 | \n",
- " 26 | \n",
- " MIA | \n",
- " 2 | \n",
- " 12 | \n",
- " 10 | \n",
- " 0 | \n",
- " 0 | \n",
- " 4 | \n",
- " 3 | \n",
- " 29 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 8 | \n",
- " 2011-12 | \n",
- " 27 | \n",
- " MIA | \n",
- " 0 | \n",
- " 6 | \n",
- " 7 | \n",
- " 0 | \n",
- " 0 | \n",
- " 4 | \n",
- " 2 | \n",
- " 36 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 9 | \n",
- " 2012-13 | \n",
- " 28 | \n",
- " MIA | \n",
- " 0 | \n",
- " 3 | \n",
- " 5 | \n",
- " 1 | \n",
- " 0 | \n",
- " 4 | \n",
- " 0 | \n",
- " 19 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 10 | \n",
- " 2013-14 | \n",
- " 29 | \n",
- " MIA | \n",
- " 1 | \n",
- " 7 | \n",
- " 7 | \n",
- " 3 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " 22 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 11 | \n",
- " 2014-15 | \n",
- " 30 | \n",
- " CLE | \n",
- " 1 | \n",
- " 5 | \n",
- " 7 | \n",
- " 2 | \n",
- " 0 | \n",
- " 4 | \n",
- " 1 | \n",
- " 30 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 12 | \n",
- " 2010-11 | \n",
- " 32 | \n",
- " LAL | \n",
- " 10 | \n",
- " 14 | \n",
- " 3 | \n",
- " 3 | \n",
- " 0 | \n",
- " 4 | \n",
- " 2 | \n",
- " 37 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 13 | \n",
- " 2011-12 | \n",
- " 33 | \n",
- " LAL | \n",
- " 0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 2 | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- " 27 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 14 | \n",
- " 2012-13 | \n",
- " 34 | \n",
- " LAL | \n",
- " 2 | \n",
- " 4 | \n",
- " 8 | \n",
- " 2 | \n",
- " 2 | \n",
- " 1 | \n",
- " 2 | \n",
- " 9 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 15 | \n",
- " 2013-14 | \n",
- " 35 | \n",
- " LAL | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 16 | \n",
- " 2014-15 | \n",
- " 36 | \n",
- " LAL | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 17 | \n",
- " 2015-16 | \n",
- " 37 | \n",
- " LAL | \n",
- " 1 | \n",
- " 6 | \n",
- " 7 | \n",
- " 1 | \n",
- " 0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 10 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 18 | \n",
- " 1997-98 | \n",
- " 19 | \n",
- " LAL | \n",
- " 2 | \n",
- " 6 | \n",
- " 1 | \n",
- " 2 | \n",
- " 0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 18 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 19 | \n",
- " 1999-00 | \n",
- " 21 | \n",
- " LAL | \n",
- " 1 | \n",
- " 1 | \n",
- " 3 | \n",
- " 2 | \n",
- " 0 | \n",
- " 1 | \n",
- " 3 | \n",
- " 15 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 20 | \n",
- " 2000-01 | \n",
- " 22 | \n",
- " LAL | \n",
- " 2 | \n",
- " 4 | \n",
- " 7 | \n",
- " 1 | \n",
- " 0 | \n",
- " 3 | \n",
- " 3 | \n",
- " 19 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 21 | \n",
- " 2001-02 | \n",
- " 23 | \n",
- " LAL | \n",
- " 2 | \n",
- " 5 | \n",
- " 5 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 2 | \n",
- " 31 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 22 | \n",
- " 2002-03 | \n",
- " 24 | \n",
- " LAL | \n",
- " 2 | \n",
- " 7 | \n",
- " 6 | \n",
- " 3 | \n",
- " 2 | \n",
- " 5 | \n",
- " 5 | \n",
- " 22 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 23 | \n",
- " 2003-04 | \n",
- " 25 | \n",
- " LAL | \n",
- " 1 | \n",
- " 4 | \n",
- " 4 | \n",
- " 5 | \n",
- " 1 | \n",
- " 6 | \n",
- " 3 | \n",
- " 20 | \n",
- " Kobe Bryant | \n",
- "
\n",
- " \n",
- " 24 | \n",
- " Season | \n",
- " Age | \n",
- " Team | \n",
- " ORB | \n",
- " TRB | \n",
- " AST | \n",
- " STL | \n",
- " BLK | \n",
- " TOV | \n",
- " PF | \n",
- " PTS | \n",
- " Player | \n",
- "
\n",
- " \n",
- " 25 | \n",
- " 2004-05 | \n",
- " 20 | \n",
- " CLE | \n",
- " 1 | \n",
- " 8 | \n",
- " 6 | \n",
- " 2 | \n",
- " 0 | \n",
- " 3 | \n",
- " 0 | \n",
- " 13 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 26 | \n",
- " 2005-06 | \n",
- " 21 | \n",
- " CLE | \n",
- " 2 | \n",
- " 6 | \n",
- " 2 | \n",
- " 2 | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- " 29 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 27 | \n",
- " 2006-07 | \n",
- " 22 | \n",
- " CLE | \n",
- " 0 | \n",
- " 6 | \n",
- " 6 | \n",
- " 1 | \n",
- " 0 | \n",
- " 4 | \n",
- " 0 | \n",
- " 28 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 28 | \n",
- " 2007-08 | \n",
- " 23 | \n",
- " CLE | \n",
- " 1 | \n",
- " 8 | \n",
- " 9 | \n",
- " 2 | \n",
- " 2 | \n",
- " 4 | \n",
- " 3 | \n",
- " 27 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 29 | \n",
- " 2008-09 | \n",
- " 24 | \n",
- " CLE | \n",
- " 0 | \n",
- " 5 | \n",
- " 3 | \n",
- " 0 | \n",
- " 0 | \n",
- " 3 | \n",
- " 0 | \n",
- " 20 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 30 | \n",
- " 1992-93 | \n",
- " 29 | \n",
- " CHI | \n",
- " 3 | \n",
- " 4 | \n",
- " 5 | \n",
- " 4 | \n",
- " 0 | \n",
- " 6 | \n",
- " 5 | \n",
- " 30 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 31 | \n",
- " 1995-96 | \n",
- " 32 | \n",
- " CHI | \n",
- " 1 | \n",
- " 4 | \n",
- " 1 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " 20 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 32 | \n",
- " 1996-97 | \n",
- " 33 | \n",
- " CHI | \n",
- " 3 | \n",
- " 11 | \n",
- " 11 | \n",
- " 2 | \n",
- " 0 | \n",
- " 3 | \n",
- " 4 | \n",
- " 14 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 33 | \n",
- " 1997-98 | \n",
- " 34 | \n",
- " CHI | \n",
- " 1 | \n",
- " 6 | \n",
- " 8 | \n",
- " 3 | \n",
- " 0 | \n",
- " 2 | \n",
- " 0 | \n",
- " 23 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 34 | \n",
- " 2001-02 | \n",
- " 38 | \n",
- " WAS | \n",
- " 0 | \n",
- " 4 | \n",
- " 3 | \n",
- " 2 | \n",
- " 0 | \n",
- " 1 | \n",
- " 1 | \n",
- " 8 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 35 | \n",
- " 2002-03 | \n",
- " 39 | \n",
- " WAS | \n",
- " 2 | \n",
- " 5 | \n",
- " 2 | \n",
- " 2 | \n",
- " 0 | \n",
- " 2 | \n",
- " 3 | \n",
- " 20 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 36 | \n",
- " 2015-16 | \n",
- " 31 | \n",
- " CLE | \n",
- " 0 | \n",
- " 4 | \n",
- " 7 | \n",
- " 0 | \n",
- " 0 | \n",
- " 4 | \n",
- " 0 | \n",
- " 13 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 37 | \n",
- " 2016-17 | \n",
- " 32 | \n",
- " CLE | \n",
- " 0 | \n",
- " 3 | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 4 | \n",
- " 2 | \n",
- " 23 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 38 | \n",
- " 2017-18 | \n",
- " 33 | \n",
- " CLE | \n",
- " 0 | \n",
- " 10 | \n",
- " 8 | \n",
- " 1 | \n",
- " 0 | \n",
- " 5 | \n",
- " 2 | \n",
- " 29 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 39 | \n",
- " 2018-19 | \n",
- " 34 | \n",
- " LAL | \n",
- " 2 | \n",
- " 8 | \n",
- " 4 | \n",
- " 0 | \n",
- " 2 | \n",
- " 1 | \n",
- " 1 | \n",
- " 19 | \n",
- " Lebron James | \n",
- "
\n",
- " \n",
- " 40 | \n",
- " 1984-85 | \n",
- " 21 | \n",
- " CHI | \n",
- " 3 | \n",
- " 6 | \n",
- " 2 | \n",
- " 3 | \n",
- " 1 | \n",
- " 1 | \n",
- " 4 | \n",
- " 7 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 41 | \n",
- " 1985-86 | \n",
- " 22 | \n",
- " CHI | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 42 | \n",
- " 1986-87 | \n",
- " 23 | \n",
- " CHI | \n",
- " 0 | \n",
- " 0 | \n",
- " 4 | \n",
- " 2 | \n",
- " 0 | \n",
- " 5 | \n",
- " 2 | \n",
- " 11 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 43 | \n",
- " 1987-88 | \n",
- " 24 | \n",
- " CHI | \n",
- " 3 | \n",
- " 8 | \n",
- " 3 | \n",
- " 4 | \n",
- " 4 | \n",
- " 2 | \n",
- " 5 | \n",
- " 40 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 44 | \n",
- " 1988-89 | \n",
- " 25 | \n",
- " CHI | \n",
- " 1 | \n",
- " 2 | \n",
- " 3 | \n",
- " 5 | \n",
- " 0 | \n",
- " 4 | \n",
- " 1 | \n",
- " 28 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 45 | \n",
- " 1989-90 | \n",
- " 26 | \n",
- " CHI | \n",
- " 1 | \n",
- " 5 | \n",
- " 2 | \n",
- " 5 | \n",
- " 1 | \n",
- " 5 | \n",
- " 1 | \n",
- " 17 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 46 | \n",
- " 1990-91 | \n",
- " 27 | \n",
- " CHI | \n",
- " 3 | \n",
- " 5 | \n",
- " 5 | \n",
- " 2 | \n",
- " 0 | \n",
- " 10 | \n",
- " 2 | \n",
- " 26 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- " 47 | \n",
- " 1991-92 | \n",
- " 28 | \n",
- " CHI | \n",
- " 1 | \n",
- " 1 | \n",
- " 5 | \n",
- " 2 | \n",
- " 0 | \n",
- " 1 | \n",
- " 2 | \n",
- " 18 | \n",
- " Michael Jordan | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " Season Age Team ORB TRB AST STL BLK TOV PF PTS Player\n",
- "0 2004-05 26 LAL 3 6 7 3 1 4 5 16 Kobe Bryant\n",
- "1 2005-06 27 LAL 0 7 8 3 0 3 5 8 Kobe Bryant\n",
- "2 2006-07 28 LAL 1 5 6 6 0 4 1 31 Kobe Bryant\n",
- "3 2007-08 29 LAL 0 1 0 0 0 0 0 0 Kobe Bryant\n",
- "4 2008-09 30 LAL 1 4 4 4 0 1 0 27 Kobe Bryant\n",
- "5 2009-10 31 LAL Kobe Bryant\n",
- "6 2009-10 25 CLE 1 5 6 4 0 2 1 25 Lebron James\n",
- "7 2010-11 26 MIA 2 12 10 0 0 4 3 29 Lebron James\n",
- "8 2011-12 27 MIA 0 6 7 0 0 4 2 36 Lebron James\n",
- "9 2012-13 28 MIA 0 3 5 1 0 4 0 19 Lebron James\n",
- "10 2013-14 29 MIA 1 7 7 3 0 1 0 22 Lebron James\n",
- "11 2014-15 30 CLE 1 5 7 2 0 4 1 30 Lebron James\n",
- "12 2010-11 32 LAL 10 14 3 3 0 4 2 37 Kobe Bryant\n",
- "13 2011-12 33 LAL 0 1 1 2 0 1 2 27 Kobe Bryant\n",
- "14 2012-13 34 LAL 2 4 8 2 2 1 2 9 Kobe Bryant\n",
- "15 2013-14 35 LAL Kobe Bryant\n",
- "16 2014-15 36 LAL Kobe Bryant\n",
- "17 2015-16 37 LAL 1 6 7 1 0 1 1 10 Kobe Bryant\n",
- "18 1997-98 19 LAL 2 6 1 2 0 1 1 18 Kobe Bryant\n",
- "19 1999-00 21 LAL 1 1 3 2 0 1 3 15 Kobe Bryant\n",
- "20 2000-01 22 LAL 2 4 7 1 0 3 3 19 Kobe Bryant\n",
- "21 2001-02 23 LAL 2 5 5 1 0 0 2 31 Kobe Bryant\n",
- "22 2002-03 24 LAL 2 7 6 3 2 5 5 22 Kobe Bryant\n",
- "23 2003-04 25 LAL 1 4 4 5 1 6 3 20 Kobe Bryant\n",
- "24 Season Age Team ORB TRB AST STL BLK TOV PF PTS Player\n",
- "25 2004-05 20 CLE 1 8 6 2 0 3 0 13 Lebron James\n",
- "26 2005-06 21 CLE 2 6 2 2 0 1 2 29 Lebron James\n",
- "27 2006-07 22 CLE 0 6 6 1 0 4 0 28 Lebron James\n",
- "28 2007-08 23 CLE 1 8 9 2 2 4 3 27 Lebron James\n",
- "29 2008-09 24 CLE 0 5 3 0 0 3 0 20 Lebron James\n",
- "30 1992-93 29 CHI 3 4 5 4 0 6 5 30 Michael Jordan\n",
- "31 1995-96 32 CHI 1 4 1 1 0 0 1 20 Michael Jordan\n",
- "32 1996-97 33 CHI 3 11 11 2 0 3 4 14 Michael Jordan\n",
- "33 1997-98 34 CHI 1 6 8 3 0 2 0 23 Michael Jordan\n",
- "34 2001-02 38 WAS 0 4 3 2 0 1 1 8 Michael Jordan\n",
- "35 2002-03 39 WAS 2 5 2 2 0 2 3 20 Michael Jordan\n",
- "36 2015-16 31 CLE 0 4 7 0 0 4 0 13 Lebron James\n",
- "37 2016-17 32 CLE 0 3 1 0 0 4 2 23 Lebron James\n",
- "38 2017-18 33 CLE 0 10 8 1 0 5 2 29 Lebron James\n",
- "39 2018-19 34 LAL 2 8 4 0 2 1 1 19 Lebron James\n",
- "40 1984-85 21 CHI 3 6 2 3 1 1 4 7 Michael Jordan\n",
- "41 1985-86 22 CHI Michael Jordan\n",
- "42 1986-87 23 CHI 0 0 4 2 0 5 2 11 Michael Jordan\n",
- "43 1987-88 24 CHI 3 8 3 4 4 2 5 40 Michael Jordan\n",
- "44 1988-89 25 CHI 1 2 3 5 0 4 1 28 Michael Jordan\n",
- "45 1989-90 26 CHI 1 5 2 5 1 5 1 17 Michael Jordan\n",
- "46 1990-91 27 CHI 3 5 5 2 0 10 2 26 Michael Jordan\n",
- "47 1991-92 28 CHI 1 1 5 2 0 1 2 18 Michael Jordan"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Creating `allstar_games_stats` table\n",
- "(\n",
- " table_env.connect(FileSystem().path(\"../datasets/nba/allstar_games_stats.csv\"))\n",
- " .with_format(\n",
- " OldCsv()\n",
- " .field(\"Season\", DataTypes.STRING())\n",
- " .field(\"Age\", DataTypes.STRING())\n",
- " .field(\"Team\", DataTypes.STRING())\n",
- " .field(\"ORB\", DataTypes.STRING())\n",
- " .field(\"TRB\", DataTypes.STRING())\n",
- " .field(\"AST\", DataTypes.STRING())\n",
- " .field(\"STL\", DataTypes.STRING())\n",
- " .field(\"BLK\", DataTypes.STRING())\n",
- " .field(\"TOV\", DataTypes.STRING())\n",
- " .field(\"PF\", DataTypes.STRING())\n",
- " .field(\"PTS\", DataTypes.STRING())\n",
- " .field(\"Player\", DataTypes.STRING())\n",
- " )\n",
- " .with_schema(\n",
- " Schema()\n",
- " .field(\"Season\", DataTypes.STRING())\n",
- " .field(\"Age\", DataTypes.STRING())\n",
- " .field(\"Team\", DataTypes.STRING())\n",
- " .field(\"ORB\", DataTypes.STRING())\n",
- " .field(\"TRB\", DataTypes.STRING())\n",
- " .field(\"AST\", DataTypes.STRING())\n",
- " .field(\"STL\", DataTypes.STRING())\n",
- " .field(\"BLK\", DataTypes.STRING())\n",
- " .field(\"TOV\", DataTypes.STRING())\n",
- " .field(\"PF\", DataTypes.STRING())\n",
- " .field(\"PTS\", DataTypes.STRING())\n",
- " .field(\"Player\", DataTypes.STRING())\n",
- " )\n",
- " .create_temporary_table(\"etl_catalog.nba.allstar_games_stats_temp\")\n",
- ")\n",
- "\n",
- "table_env.execute_sql(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS etl_catalog.nba.allstar_games_stats (Season STRING, Age STRING,\n",
- " Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING,\n",
- " PF STRING, PTS STRING, Player STRING)\"\"\"\n",
- ").wait()\n",
- "\n",
- "tab = table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\")\n",
- "tab.execute_insert(\"etl_catalog.nba.allstar_games_stats\").wait()\n",
- "\n",
- "# Notice how we view the data on the etl branch via @etl\n",
- "table_env.from_path(\"etl_catalog.nba.`allstar_games_stats@etl`\").to_pandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can verify that the new table isn't on the `main` branch but is present on the etl branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.totals_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Since we have been working on the `etl` branch, the `allstar_games_stats` table is not on the `main` branch\n",
- "!nessie content list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.allstar_games_stats\n",
- "\tnba.new_totals_stats\n",
- "\tnba.salaries\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# We should see `allstar_games_stats` and the `new_totals_stats` on the `etl` branch\n",
- "!nessie content list --ref etl"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that we are happy with the data we can again merge it into `main`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie merge etl -b main --force"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "Now lets verify that the changes exist on the `main` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.new_totals_stats\n",
- "\tnba.allstar_games_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[33m* main 720543fa3a9579d0bfee11e07f383d86468eb4d73dc207e5bd6ef7f76b000930\n",
- "\u001b[0m etl c962d80b04ee619a6a0670cb5f664d948c86f6ebf66435027c5abe761e920c9e\n",
- " dev f48b93594ddead3a7616a271d657f0fff97cd0c4c04d4a579fa165aa96a69908\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie --verbose branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2022-05-24 07:45:19,196 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:19,257 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:19,260 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:19,263 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:19,265 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:19,268 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:19,270 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n",
- "2022-05-24 07:45:19,273 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n"
- ]
- }
- ],
- "source": [
- "table_count = (\n",
- " table_env.from_path(\"main_catalog.nba.allstar_games_stats\").select(\"Season.count\").to_pandas().values[0][0]\n",
- ")\n",
- "csv_count = (\n",
- " table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n",
- ")\n",
- "assert table_count == csv_count"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create `experiment` branch\n",
- "--------------------------------\n",
- "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n",
- "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n",
- "and carry out our experiment, which could consist of the following steps:\n",
- "- drop `totals_stats` table\n",
- "- add data to `salaries` table\n",
- "- compare `experiment` and `main` tables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "create_ref_catalog(\"experiment\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- ""
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Drop the `new_totals_stats` table on the `experiment` branch\n",
- "table_env.execute_sql(\"DROP TABLE experiment_catalog.nba.new_totals_stats\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2022-05-24 07:45:20,258 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n"
- ]
- }
- ],
- "source": [
- "# add some salaries for Dirk Nowitzki\n",
- "table_env.execute_sql(\n",
- " \"\"\"INSERT INTO experiment_catalog.nba.salaries VALUES\n",
- " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n",
- " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n",
- " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n",
- " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n",
- ").wait()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.allstar_games_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `new_totals_stats`)\n",
- "!nessie content list --ref experiment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.new_totals_stats\n",
- "\tnba.allstar_games_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# `main` hasn't changed been changed and still has the `new_totals_stats` table\n",
- "!nessie content list"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n",
- "Notice the use of the `nessie` catalog and the use of `@experiment` to view data on the `experiment` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " EXPR$0 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 59 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " EXPR$0\n",
- "0 59"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "table_env.from_path(\"main_catalog.nba.`salaries@experiment`\").select(lit(1).count).to_pandas()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n",
- "to the `main` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " EXPR$0 | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 55 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " EXPR$0\n",
- "0 55"
- ]
- },
- "execution_count": null,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "table_env.from_path(\"main_catalog.nba.`salaries@main`\").select(lit(1).count).to_pandas()"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Nessie Iceberg/Flink SQL Demo with NBA Dataset\n",
+ "============================\n",
+ "This demo showcases how to use the Nessie Python API together with Apache Flink and Apache Iceberg\n",
+ "\n",
+ "Initialize PyFlink\n",
+ "----------------------------------------------\n",
+ "To get started, we will first have to do a few setup steps that give us everything we need\n",
+ "to get started with Nessie. In case you're interested in the detailed setup steps for Flink, you can check out the [docs](https://projectnessie.org/tools/iceberg/flink/)\n",
+ "\n",
+ "The Binder server has downloaded Flink and some data for us as well as started a Nessie server in the background. All we have to do is start Flink.\n",
+ "\n",
+ "The below cell starts a local Flink session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from pyflink.datastream import StreamExecutionEnvironment\n",
+ "from pyflink.table import StreamTableEnvironment\n",
+ "from pyflink.table.expressions import lit\n",
+ "from pynessie import init\n",
+ "\n",
+ "# where we will store our data\n",
+ "warehouse = os.path.join(os.getcwd(), \"flink-warehouse\")\n",
+ "# this was downloaded when Binder started, it's available on Maven Central\n",
+ "iceberg_flink_runtime_jar = os.path.join(os.getcwd(), \"../iceberg-flink-runtime-1.17-1.4.2.jar\")\n",
+ "assert os.path.exists(iceberg_flink_runtime_jar)\n",
+ "\n",
+ "env = StreamExecutionEnvironment.get_execution_environment()\n",
+ "env.add_jars(\"file://{}\".format(iceberg_flink_runtime_jar))\n",
+ "table_env = StreamTableEnvironment.create(env)\n",
+ "\n",
+ "nessie_client = init()\n",
+ "\n",
+ "\n",
+ "def create_ref_catalog(ref):\n",
+ " \"\"\"\n",
+ " Create a flink catalog that is tied to a specific ref.\n",
+ "\n",
+ " In order to create the catalog we have to first create the branch\n",
+ " \"\"\"\n",
+ " default_branch = nessie_client.get_default_branch()\n",
+ " if ref != default_branch:\n",
+ " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n",
+ " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n",
+ " # The important args below are:\n",
+ " # type - tell Flink to use Iceberg as the catalog\n",
+ " # catalog-impl - which Iceberg catalog to use, in this case we want Nessie\n",
+ " # uri - the location of the nessie server.\n",
+ " # ref - the Nessie ref/branch we want to use (defaults to main)\n",
+ " # warehouse - the location this catalog should store its data\n",
+ " table_env.execute_sql(\n",
+ " f\"\"\"CREATE CATALOG {ref}_catalog WITH (\n",
+ " 'type'='iceberg',\n",
+ " 'catalog-impl'='org.apache.iceberg.nessie.NessieCatalog',\n",
+ " 'uri'='http://localhost:19120/api/v1',\n",
+ " 'ref'='{ref}',\n",
+ " 'warehouse' = '{warehouse}')\"\"\"\n",
+ " )\n",
+ "\n",
+ "\n",
+ "create_ref_catalog(nessie_client.get_default_branch())\n",
+ "print(\"\\n\\n\\nFlink running\\n\\n\\n\")\n",
+ "\n",
+ "# Create the 'nba' namespace.\n",
+ "table_env.execute_sql(\"CREATE DATABASE main_catalog.nba\").wait()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Solving Data Engineering problems with Nessie\n",
+ "============================\n",
+ "\n",
+ "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n",
+ "\n",
+ "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Set up Nessie branches (via Nessie CLI)\n",
+ "----------------------------\n",
+ "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n",
+ "\n",
+ "- Create a new branch named `dev`\n",
+ "- List all branches\n",
+ "\n",
+ "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "create_ref_catalog(\"dev\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "We have created the branch `dev` and we can see the branch with the Nessie `hash` it is currently pointing to.\n",
+ "\n",
+ "Below we list all branches. Note that the auto-created `main` branch already exists and both branches initially point at the same empty `hash`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie --verbose branch"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create tables under dev branch\n",
+ "-------------------------------------\n",
+ "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n",
+ "\n",
+ "We create two tables under the `dev` branch:\n",
+ "- `salaries`\n",
+ "- `totals_stats`\n",
+ "\n",
+ "These tables list the salaries per player per year and their stats per year.\n",
+ "\n",
+ "To create the data we:\n",
+ "\n",
+ "1. switch our branch context to dev\n",
+ "2. create the table\n",
+ "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the dataset\n",
+ "from pyflink.table import DataTypes, Schema, TableDescriptor\n",
+ "from pyflink.table.expressions import col\n",
+ "\n",
+ "# Creating `salaries` table\n",
+ "(\n",
+ " table_env.create_temporary_table(\n",
+ " \"dev_catalog.nba.salaries_temp\",\n",
+ " TableDescriptor.for_connector(\"filesystem\")\n",
+ " .schema(\n",
+ " Schema.new_builder()\n",
+ " .column(\"Season\", DataTypes.STRING())\n",
+ " .column(\"Team\", DataTypes.STRING())\n",
+ " .column(\"Salary\", DataTypes.STRING())\n",
+ " .column(\"Player\", DataTypes.STRING())\n",
+ " .build()\n",
+ " )\n",
+ " .option(\"path\", \"../datasets/nba/salaries.csv\")\n",
+ " .format(\"csv\")\n",
+ " .build(),\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "table_env.execute_sql(\n",
+ " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.`salaries@dev`\n",
+ " (Season STRING, Team STRING, Salary STRING, Player STRING)\"\"\"\n",
+ ").wait()\n",
+ "\n",
+ "tab = table_env.from_path(\"dev_catalog.nba.salaries_temp\")\n",
+ "tab.execute_insert(\"dev_catalog.nba.`salaries@dev`\").wait()\n",
+ "\n",
+ "# Creating `totals_stats` table\n",
+ "(\n",
+ " table_env.create_temporary_table(\n",
+ " \"dev_catalog.nba.totals_stats_temp\",\n",
+ " TableDescriptor.for_connector(\"filesystem\")\n",
+ " .schema(\n",
+ " Schema.new_builder()\n",
+ " .column(\"Season\", DataTypes.STRING())\n",
+ " .column(\"Age\", DataTypes.STRING())\n",
+ " .column(\"Team\", DataTypes.STRING())\n",
+ " .column(\"ORB\", DataTypes.STRING())\n",
+ " .column(\"DRB\", DataTypes.STRING())\n",
+ " .column(\"TRB\", DataTypes.STRING())\n",
+ " .column(\"AST\", DataTypes.STRING())\n",
+ " .column(\"STL\", DataTypes.STRING())\n",
+ " .column(\"BLK\", DataTypes.STRING())\n",
+ " .column(\"TOV\", DataTypes.STRING())\n",
+ " .column(\"PTS\", DataTypes.STRING())\n",
+ " .column(\"Player\", DataTypes.STRING())\n",
+ " .column(\"RSorPO\", DataTypes.STRING())\n",
+ " .build()\n",
+ " )\n",
+ " .option(\"path\", \"../datasets/nba/totals_stats.csv\")\n",
+ " .format(\"csv\")\n",
+ " .build(),\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "table_env.execute_sql(\n",
+ " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.`totals_stats@dev` (Season STRING, Age STRING, Team STRING,\n",
+ " ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING, PTS STRING,\n",
+ " Player STRING, RSorPO STRING)\"\"\"\n",
+ ").wait()\n",
+ "\n",
+ "tab = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\")\n",
+ "tab.execute_insert(\"dev_catalog.nba.`totals_stats@dev`\").wait()\n",
+ "\n",
+ "salaries = table_env.from_path(\"dev_catalog.nba.`salaries@dev`\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ "totals_stats = (\n",
+ " table_env.from_path(\"dev_catalog.nba.`totals_stats@dev`\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ ")\n",
+ "print(f\"\\n\\n\\nAdded {salaries} rows to the salaries table and {totals_stats} rows to the totals_stats table.\\n\\n\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "Now we count the rows in our tables to ensure they are the same number as the csv files. Note we use the `table@branch` notation which overrides the context set by the catalog."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "table_count = table_env.from_path(\"dev_catalog.nba.`salaries@dev`\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)\n",
+ "\n",
+ "table_count = (\n",
+ " table_env.from_path(\"dev_catalog.nba.`totals_stats@dev`\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ ")\n",
+ "csv_count = (\n",
+ " table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ ")\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Check generated tables\n",
+ "----------------------------\n",
+ "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n",
+ "let's verify that the `main` branch was not altered by our changes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "And on the `dev` branch we expect to see two tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nessie content list --ref dev"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "We can also verify that the `dev` and `main` branches point to different commits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nessie --verbose branch"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Dev promotion into main\n",
+ "-----------------------\n",
+ "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n",
+ "We merge `dev` into `main` via the command line `merge` command.\n",
+ "Both branches should be at the same revision after merging/promotion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nessie merge dev -b main --force"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "We can verify that the `main` branch now contains the expected tables and row counts.\n",
+ "\n",
+ "The tables are now on `main` and ready for consumption by our blog authors and analysts!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nessie --verbose branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "flink-demo",
- "language": "python",
- "name": "flink-demo"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.12"
+ },
+ "outputs": [],
+ "source": [
+ "table_count = table_env.from_path(\"main_catalog.nba.salaries\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ "assert table_count == csv_count\n",
+ "\n",
+ "table_count = table_env.from_path(\"main_catalog.nba.totals_stats\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ "csv_count = (\n",
+ " table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ ")\n",
+ "assert table_count == csv_count"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "Perform regular ETL on the new tables\n",
+ "-------------------\n",
+ "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n",
+ "\n",
+ "1. Update the salaries table to add new data\n",
+ "2. We have decided to rename the `totals_stats` table to `new_totals_stats`\n",
+ "3. We create a new table to hold information about the players' appearances in all star games\n",
+ "\n",
+ "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "create_ref_catalog(\"etl\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# add some salaries for Kevin Durant\n",
+ "table_env.execute_sql(\n",
+ " \"\"\"INSERT INTO etl_catalog.nba.salaries\n",
+ " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n",
+ " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n",
+ " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n",
+ " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n",
+ ").wait()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rename the table `totals_stats` to `new_totals_stats`\n",
+ "table_env.execute_sql(\n",
+ " \"ALTER TABLE etl_catalog.nba.`totals_stats@etl` RENAME TO etl_catalog.nba.new_totals_stats\"\n",
+ ").wait()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creating `allstar_games_stats` table\n",
+ "(\n",
+ " table_env.create_temporary_table(\n",
+ " \"etl_catalog.nba.allstar_games_stats_temp\",\n",
+ " TableDescriptor.for_connector(\"filesystem\")\n",
+ " .schema(\n",
+ " Schema.new_builder()\n",
+ " .column(\"Season\", DataTypes.STRING())\n",
+ " .column(\"Age\", DataTypes.STRING())\n",
+ " .column(\"Team\", DataTypes.STRING())\n",
+ " .column(\"ORB\", DataTypes.STRING())\n",
+ " .column(\"TRB\", DataTypes.STRING())\n",
+ " .column(\"AST\", DataTypes.STRING())\n",
+ " .column(\"STL\", DataTypes.STRING())\n",
+ " .column(\"BLK\", DataTypes.STRING())\n",
+ " .column(\"TOV\", DataTypes.STRING())\n",
+ " .column(\"PF\", DataTypes.STRING())\n",
+ " .column(\"PTS\", DataTypes.STRING())\n",
+ " .column(\"Player\", DataTypes.STRING())\n",
+ " .build()\n",
+ " )\n",
+ " .option(\"path\", \"../datasets/nba/allstar_games_stats.csv\")\n",
+ " .format(\"csv\")\n",
+ " .build(),\n",
+ " )\n",
+ ")\n",
+ "\n",
+ "table_env.execute_sql(\n",
+ " \"\"\"CREATE TABLE IF NOT EXISTS etl_catalog.nba.`allstar_games_stats@etl` (Season STRING, Age STRING,\n",
+ " Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING,\n",
+ " PF STRING, PTS STRING, Player STRING)\"\"\"\n",
+ ").wait()\n",
+ "\n",
+ "tab = table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\")\n",
+ "tab.execute_insert(\"etl_catalog.nba.`allstar_games_stats@etl`\").wait()\n",
+ "\n",
+ "# Notice how we view the data on the etl branch via @etl\n",
+ "table_env.from_path(\"etl_catalog.nba.`allstar_games_stats@etl`\").to_pandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can verify that the new table isn't on the `main` branch but is present on the etl branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Since we have been working on the `etl` branch, the `allstar_games_stats` table is not on the `main` branch\n",
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# We should see `allstar_games_stats` and the `new_totals_stats` on the `etl` branch\n",
+ "!nessie content list --ref etl"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we are happy with the data we can again merge it into `main`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie merge etl -b main --force"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "Now let's verify that the changes exist on the `main` branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie --verbose branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "table_count = (\n",
+ " table_env.from_path(\"main_catalog.nba.allstar_games_stats\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ ")\n",
+ "csv_count = (\n",
+ " table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\").select(col(\"Season\").count).to_pandas().values[0][0]\n",
+ ")\n",
+ "assert table_count == csv_count"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create `experiment` branch\n",
+ "--------------------------------\n",
+ "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n",
+ "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n",
+ "and carry out our experiment, which could consist of the following steps:\n",
+ "- drop `new_totals_stats` table\n",
+ "- add data to `salaries` table\n",
+ "- compare `experiment` and `main` tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "create_ref_catalog(\"experiment\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the `new_totals_stats` table on the `experiment` branch\n",
+ "table_env.execute_sql(\"DROP TABLE experiment_catalog.nba.`new_totals_stats@etl`\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add some salaries for Dirk Nowitzki\n",
+ "table_env.execute_sql(\n",
+ " \"\"\"INSERT INTO experiment_catalog.nba.salaries VALUES\n",
+ " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n",
+ " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n",
+ " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n",
+ " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n",
+ ").wait()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `new_totals_stats`)\n",
+ "!nessie content list --ref experiment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# `main` hasn't been changed and still has the `new_totals_stats` table\n",
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n",
+ "Notice the use of the `main_catalog` catalog and the use of `@experiment` to view data on the `experiment` branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "table_env.from_path(\"main_catalog.nba.`salaries@experiment`\").select(lit(1).count).to_pandas()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n",
+ "to the `main` branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
}
+ },
+ "outputs": [],
+ "source": [
+ "table_env.from_path(\"main_catalog.nba.`salaries@main`\").select(lit(1).count).to_pandas()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "flink-demo",
+ "language": "python",
+ "name": "flink-demo"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/nessie-iceberg-hive-demo-nba.ipynb b/notebooks/nessie-iceberg-hive-demo-nba.ipynb
index 8dd639f8..e0680cb2 100644
--- a/notebooks/nessie-iceberg-hive-demo-nba.ipynb
+++ b/notebooks/nessie-iceberg-hive-demo-nba.ipynb
@@ -1,1084 +1,821 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "tags": []
- },
- "source": [
- "Nessie Iceberg/Hive SQL Demo with NBA Dataset\n",
- "============================\n",
- "This demo showcases how to use Nessie Python API along with Hive from Iceberg\n",
- "\n",
- "Initialize PyHive\n",
- "----------------------------------------------\n",
- "To get started, we will first have to do a few setup steps that give us everything we need\n",
- "to get started with Nessie. In case you're interested in the detailed setup steps for Hive, you can check out the [docs](https://projectnessie.org/tools/iceberg/hive/)\n",
- "\n",
- "The Binder server has downloaded Hive, Hadoop and some data for us as well as started a Nessie server in the background. All we have to do is to connect to Hive session.\n",
- "\n",
- "The below cell starts a local Hive session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "\n",
- "Hive running\n",
- "\n",
- "\n",
- "\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "from pyhive import hive\n",
- "from pynessie import init\n",
- "\n",
- "# where we will store our data\n",
- "warehouse = \"file://\" + os.path.join(os.getcwd(), \"nessie_warehouse\")\n",
- "\n",
- "# where our datasets are located\n",
- "datasets_path = \"file://\" + os.path.join(os.path.dirname(os.getcwd()), \"datasets\")\n",
- "\n",
- "nessie_client = init()\n",
- "\n",
- "\n",
- "def create_ref_catalog(ref):\n",
- " \"\"\"\n",
- " Create a branch and switch the current ref to the created branch\n",
- " \"\"\"\n",
- " default_branch = nessie_client.get_default_branch()\n",
- " if ref != default_branch:\n",
- " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n",
- " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n",
- " return switch_ref_catalog(ref)\n",
- "\n",
- "\n",
- "def switch_ref_catalog(ref):\n",
- " \"\"\"\n",
- " Switch a branch. When we switch the branch via Hive, we will need to reconnect to Hive\n",
- " \"\"\"\n",
- " # The important args below are:\n",
- " # catalog-impl: which Iceberg catalog to use, in this case we want NessieCatalog\n",
- " # uri: the location of the nessie server.\n",
- " # ref: the Nessie ref/branch we want to use (defaults to main)\n",
- " # warehouse: the location this catalog should store its data\n",
- " return hive.connect(\n",
- " \"localhost\",\n",
- " configuration={\n",
- " \"iceberg.catalog.dev_catalog.catalog-impl\": \"org.apache.iceberg.nessie.NessieCatalog\",\n",
- " \"iceberg.catalog.dev_catalog.uri\": \"http://localhost:19120/api/v1\",\n",
- " \"iceberg.catalog.dev_catalog.ref\": ref,\n",
- " \"iceberg.catalog.dev_catalog.warehouse\": warehouse,\n",
- " },\n",
- " ).cursor()\n",
- "\n",
- "\n",
- "print(\"\\n\\nHive running\\n\\n\\n\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Solving Data Engineering problems with Nessie\n",
- "============================\n",
- "\n",
- "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n",
- "\n",
- "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Set up Nessie branches (via Nessie CLI)\n",
- "----------------------------\n",
- "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n",
- "\n",
- "- Create a new branch named `dev`\n",
- "- List all branches\n",
- "\n",
- "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "current_ref = create_ref_catalog(\"dev\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n",
- "\n",
- "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n",
- "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n",
- "\u001b[0m\n"
- ]
- }
- ],
- "source": [
- "!nessie --verbose branch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create tables under dev branch\n",
- "-------------------------------------\n",
- "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n",
- "\n",
- "We create two tables under the `dev` branch:\n",
- "- `salaries`\n",
- "- `totals_stats`\n",
- "\n",
- "These tables list the salaries per player per year and their stats per year.\n",
- "\n",
- "To create the data we:\n",
- "\n",
- "1. switch our branch context to dev\n",
- "2. create the table\n",
- "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Created schema nba\n",
- "\n",
- "\n",
- "Creating tables nba.salaries and nba.totals_stats....\n",
- "\n",
- "\n",
- "Created and inserted data into table nba.salaries from dataset salaries\n",
- "\n",
- "\n",
- "Created and inserted data into table nba.totals_stats from dataset totals_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Creating our demo schema\n",
- "current_ref.execute(\"CREATE SCHEMA IF NOT EXISTS nba\")\n",
- "\n",
- "print(\"\\nCreated schema nba\\n\")\n",
- "\n",
- "\n",
- "print(\"\\nCreating tables nba.salaries and nba.totals_stats....\\n\")\n",
- "\n",
- "# Creating `salaries` table\n",
- "\n",
- "current_ref.execute(\n",
- " f\"\"\"CREATE TABLE IF NOT EXISTS nba.salaries (Season STRING,\n",
- " Team STRING, Salary STRING, Player STRING)\n",
- " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n",
- " LOCATION '{warehouse}/nba/salaries'\n",
- " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n",
- " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n",
- ")\n",
- "\n",
- "## We create a temporary table to load data into our target table since\n",
- "## is not possible to load data directly from CSV into non-native table.\n",
- "current_ref.execute(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS nba.salaries_temp (Season STRING,\n",
- " Team STRING, Salary STRING, Player STRING)\n",
- " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n",
- ")\n",
- "\n",
- "current_ref.execute(f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/salaries.csv\" OVERWRITE INTO TABLE nba.salaries_temp')\n",
- "current_ref.execute(\"INSERT OVERWRITE TABLE nba.salaries SELECT * FROM nba.salaries_temp\")\n",
- "\n",
- "print(\"\\nCreated and inserted data into table nba.salaries from dataset salaries\\n\")\n",
- "\n",
- "\n",
- "# Creating `totals_stats` table\n",
- "\n",
- "current_ref.execute(\n",
- " f\"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats (\n",
- " Season STRING, Age STRING, Team STRING, ORB STRING,\n",
- " DRB STRING, TRB STRING, AST STRING, STL STRING,\n",
- " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n",
- " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n",
- " LOCATION '{warehouse}/nba/totals_stats'\n",
- " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n",
- " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n",
- ")\n",
- "\n",
- "## We create a temporary table to load data into our target table since\n",
- "## is not possible to load data directly from CSV into non-native table.\n",
- "current_ref.execute(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats_temp (\n",
- " Season STRING, Age STRING, Team STRING, ORB STRING,\n",
- " DRB STRING, TRB STRING, AST STRING, STL STRING,\n",
- " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n",
- " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n",
- ")\n",
- "\n",
- "current_ref.execute(\n",
- " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/totals_stats.csv\" OVERWRITE INTO TABLE nba.totals_stats_temp'\n",
- ")\n",
- "current_ref.execute(\"INSERT OVERWRITE TABLE nba.totals_stats SELECT * FROM nba.totals_stats_temp\")\n",
- "\n",
- "print(\"\\nCreated and inserted data into table nba.totals_stats from dataset totals_stats\\n\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "Now we count the rows in our tables to ensure they are the same number as the csv files. Unlike Spark and Flink demos, we can't use the notation of `table@branch` (see the github issue [here](https://github.com/projectnessie/nessie/issues/1985). Therefore, we just set Nessie ref settings through Hive setting `SET iceberg.catalog.{catalog}.ref = {branch}` whenever we want to work on a specific branch."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- },
- "tags": []
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Counting rows in nba.salaries\n",
- "\n",
- "51\n",
- "\n",
- "Counting rows in nba.totals_stats\n",
- "\n",
- "93\n"
- ]
- }
- ],
- "source": [
- "# We make sure we are still in dev branch\n",
- "current_ref = switch_ref_catalog(\"dev\")\n",
- "\n",
- "print(\"\\nCounting rows in nba.salaries\\n\")\n",
- "\n",
- "# We count now\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n",
- "table_count = current_ref.fetchone()[0]\n",
- "\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n",
- "csv_count = current_ref.fetchone()[0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)\n",
- "\n",
- "print(\"\\nCounting rows in nba.totals_stats\\n\")\n",
- "\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n",
- "table_count = current_ref.fetchone()[0]\n",
- "\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n",
- "csv_count = current_ref.fetchone()[0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Check generated tables\n",
- "----------------------------\n",
- "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n",
- "let's verify that the `main` branch was not altered by our changes."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "And on the `dev` branch we expect to see two tables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.totals_stats\n",
- "\tnba.salaries\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list --ref dev"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "We can also verify that the `dev` and `main` branches point to different commits"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- " dev d1ea40ccb14fd8365828bf740d73e8ed9d04ce5d9739020d00d7ffa5937cf9d3\n",
- "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n",
- "\u001b[0m\n"
- ]
- }
- ],
- "source": [
- "!nessie --verbose branch"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Dev promotion into main\n",
- "-----------------------\n",
- "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n",
- "We merge `dev` into `main` via the command line `merge` command.\n",
- "Both branches should be at the same revision after merging/promotion."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie merge dev -b main --force"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "We can verify that the `main` branch now contains the expected tables and row counts.\n",
- "\n",
- "The tables are now on `main` and ready for consumption by our blog authors and analysts!"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[33m* main 330f993ac08aceb2252702611f6bf1a92f49ac2e3fc709b250a017ba4a9cded6\n",
- "\u001b[0m dev d1ea40ccb14fd8365828bf740d73e8ed9d04ce5d9739020d00d7ffa5937cf9d3\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie --verbose branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.totals_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Counting rows in nba.salaries\n",
- "\n",
- "51\n",
- "\n",
- "Counting rows in nba.totals_stats\n",
- "\n",
- "93\n"
- ]
- }
- ],
- "source": [
- "# We switch to main branch\n",
- "current_ref = switch_ref_catalog(\"main\")\n",
- "\n",
- "print(\"\\nCounting rows in nba.salaries\\n\")\n",
- "\n",
- "# We count now\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n",
- "table_count = current_ref.fetchone()[0]\n",
- "\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n",
- "csv_count = current_ref.fetchone()[0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)\n",
- "\n",
- "print(\"\\nCounting rows in nba.totals_stats\\n\")\n",
- "\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n",
- "table_count = current_ref.fetchone()[0]\n",
- "\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n",
- "csv_count = current_ref.fetchone()[0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "Perform regular ETL on the new tables\n",
- "-------------------\n",
- "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n",
- "\n",
- "1. Update the salaries table to add new data\n",
- "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n",
- "3. We create a new table to hold information about the players appearances in all star games\n",
- "\n",
- "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "current_ref = create_ref_catalog(\"etl\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [],
- "source": [
- "# add some salaries for Kevin Durant\n",
- "current_ref.execute(\n",
- " \"\"\"INSERT INTO nba.salaries\n",
- " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n",
- " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n",
- " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n",
- " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Creating table nba.allstar_games_stats\n",
- "\n",
- "\n",
- "Created and inserted data into table nba.allstar_table_temp from dataset allstar_games_stats\n",
- "\n",
- "\n",
- "Counting rows in nba.allstar_games_stats\n",
- "\n",
- "48\n"
- ]
- }
- ],
- "source": [
- "print(\"\\nCreating table nba.allstar_games_stats\\n\")\n",
- "\n",
- "# Creating `allstar_games_stats` table\n",
- "current_ref.execute(\n",
- " f\"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_games_stats (\n",
- " Season STRING, Age STRING, Team STRING, ORB STRING,\n",
- " TRB STRING, AST STRING, STL STRING, BLK STRING,\n",
- " TOV STRING, PF STRING, PTS STRING, Player STRING)\n",
- " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n",
- " LOCATION '{warehouse}/nba/allstar_games_stats'\n",
- " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n",
- " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n",
- ")\n",
- "\n",
- "## We create a temporary table to load data into our target table since\n",
- "## is not possible to load data directly from CSV into non-native table.\n",
- "current_ref.execute(\n",
- " \"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_table_temp (\n",
- " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING,\n",
- " AST STRING, STL STRING, BLK STRING,\n",
- " TOV STRING, PF STRING, PTS STRING, Player STRING)\n",
- " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n",
- ")\n",
- "\n",
- "current_ref.execute(\n",
- " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/allstar_games_stats.csv\" OVERWRITE INTO TABLE nba.allstar_table_temp'\n",
- ")\n",
- "current_ref.execute(\"INSERT OVERWRITE TABLE nba.allstar_games_stats SELECT * FROM nba.allstar_table_temp\")\n",
- "\n",
- "print(\"\\nCreated and inserted data into table nba.allstar_table_temp from dataset allstar_games_stats\\n\")\n",
- "\n",
- "\n",
- "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n",
- "\n",
- "# Since we can't do 'table@branch'\n",
- "current_ref = switch_ref_catalog(\"etl\")\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n",
- "print(current_ref.fetchone()[0])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "We can verify that the new table isn't on the `main` branch but is present on the etl branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.totals_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# Since we have been working on the `etl` branch, the `allstar_games_stats` table is not on the `main` branch\n",
- "!nessie content list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.allstar_games_stats\n",
- "\tnba.salaries\n",
- "\tnba.totals_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# We should see the new `allstar_games_stats` table on the `etl` branch\n",
- "!nessie content list --ref etl"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Now that we are happy with the data we can again merge it into `main`"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie merge etl -b main --force"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%% md\n"
- }
- },
- "source": [
- "Now lets verify that the changes exist on the `main` branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.allstar_games_stats\n",
- "\tnba.totals_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie content list"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\u001b[33m* main 11ed5b46713231a5fb85f31083d47dbf6bfa1df5839bebbac08301cda8afe22f\n",
- "\u001b[0m etl a3e06ba7595dfdb8bc67b0d6825587d2858cfe2b013bf1b95c5a1471578c4af3\n",
- " dev d1ea40ccb14fd8365828bf740d73e8ed9d04ce5d9739020d00d7ffa5937cf9d3\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie --verbose branch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Counting rows in nba.allstar_games_stats\n",
- "\n",
- "48\n"
- ]
- }
- ],
- "source": [
- "# We switch to the main branch\n",
- "current_ref = switch_ref_catalog(\"main\")\n",
- "\n",
- "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n",
- "\n",
- "# We count now\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n",
- "table_count = current_ref.fetchone()[0]\n",
- "\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_table_temp\")\n",
- "csv_count = current_ref.fetchone()[0]\n",
- "assert table_count == csv_count\n",
- "print(table_count)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create `experiment` branch\n",
- "--------------------------------\n",
- "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n",
- "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n",
- "and carry out our experiment, which could consist of the following steps:\n",
- "- drop `totals_stats` table\n",
- "- add data to `salaries` table\n",
- "- compare `experiment` and `main` tables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "current_ref = create_ref_catalog(\"experiment\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Drop the `totals_stats` table on the `experiment` branch\n",
- "current_ref.execute(\"DROP TABLE nba.totals_stats\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "# add some salaries for Dirk Nowitzki\n",
- "current_ref.execute(\n",
- " \"\"\"INSERT INTO nba.salaries VALUES\n",
- " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n",
- " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n",
- " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n",
- " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.allstar_games_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `totals_stats`)\n",
- "!nessie content list --ref experiment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "jupyter": {
- "outputs_hidden": false
- },
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "ICEBERG_TABLE:\n",
- "\tnba.salaries\n",
- "\tnba.allstar_games_stats\n",
- "\tnba.totals_stats\n",
- "\n"
- ]
- }
- ],
- "source": [
- "# `main` hasn't been changed and still has the `totals_stats` table\n",
- "!nessie content list"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Let's take a look at the contents of the `salaries` table on the `experiment` branch."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Counting rows in nba.salaries\n",
- "\n",
- "59\n"
- ]
- }
- ],
- "source": [
- "current_ref = switch_ref_catalog(\"experiment\")\n",
- "\n",
- "print(\"\\nCounting rows in nba.salaries\\n\")\n",
- "\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n",
- "print(current_ref.fetchone()[0])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "and compare to the contents of the `salaries` table on the `main` branch."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "pycharm": {
- "name": "#%%\n"
- }
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "Counting rows in nba.salaries\n",
- "\n",
- "56\n"
- ]
- }
- ],
- "source": [
- "current_ref = switch_ref_catalog(\"main\")\n",
- "\n",
- "# the following INSERT is a workaround for https://github.com/apache/iceberg/pull/4509 until iceberg 0.13.2 is released\n",
- "# add a single salary for Dirk Nowitzki (so we expect 3 less total rows)\n",
- "current_ref.execute(\n",
- " \"\"\"INSERT INTO nba.salaries VALUES\n",
- " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n",
- ")\n",
- "\n",
- "print(\"\\nCounting rows in nba.salaries\\n\")\n",
- "\n",
- "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n",
- "print(current_ref.fetchone()[0])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And finally lets clean up after ourselves"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "\n",
- "\n"
- ]
- }
- ],
- "source": [
- "!nessie branch --delete dev\n",
- "!nessie branch --delete etl\n",
- "!nessie branch --delete experiment"
- ]
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "tags": []
+ },
+ "source": [
+ "Nessie Iceberg/Hive SQL Demo with NBA Dataset\n",
+ "============================\n",
+ "This demo showcases how to use Nessie Python API along with Hive from Iceberg\n",
+ "\n",
+ "Initialize PyHive\n",
+ "----------------------------------------------\n",
+ "To get started, we will first have to do a few setup steps that give us everything we need\n",
+ "to get started with Nessie. In case you're interested in the detailed setup steps for Hive, you can check out the [docs](https://projectnessie.org/tools/iceberg/hive/)\n",
+ "\n",
+ "The Binder server has downloaded Hive, Hadoop and some data for us as well as started a Nessie server in the background. All we have to do is connect to a Hive session.\n",
+ "\n",
+ "The below cell starts a local Hive session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "import requests\n",
+ "from pyhive import hive\n",
+ "from pynessie import init\n",
+ "\n",
+ "# where we will store our data\n",
+ "warehouse = \"file://\" + os.path.join(os.getcwd(), \"nessie_warehouse\")\n",
+ "\n",
+ "# where our datasets are located\n",
+ "datasets_path = \"file://\" + os.path.join(os.path.dirname(os.getcwd()), \"datasets\")\n",
+ "\n",
+ "nessie_client = init()\n",
+ "\n",
+ "\n",
+ "def create_namespace(ref: str, namespace: list[str]):\n",
+ " hash = nessie_client.get_reference(ref).hash_\n",
+ " # pynessie client has currently no code to create namespace, issue a plain REST request.\n",
+ " response = requests.post(\n",
+ " url=f\"http://127.0.0.1:19120/api/v2/trees/{ref}@{hash}/history/commit\",\n",
+ " headers={\"Accept\": \"application/json\", \"Content-Type\": \"application/json\"},\n",
+ " json={\n",
+ " \"commitMeta\": {\"message\": \"Create namespace nba\"},\n",
+ " \"operations\": [{\"type\": \"PUT\", \"key\": {\"elements\": namespace}, \"content\": {\"type\": \"NAMESPACE\"}}],\n",
+ " },\n",
+ " )\n",
+ " if response.status_code != 200:\n",
+ " raise Exception(f\"Could not create namespace: HTTP {response.status_code} {response.reason}: {response.json()}\")\n",
+ "\n",
+ "\n",
+ "def create_ref_catalog(ref: str):\n",
+ " \"\"\"\n",
+ " Create a branch and switch the current ref to the created branch\n",
+ " \"\"\"\n",
+ " default_branch = nessie_client.get_default_branch()\n",
+ " if ref != default_branch:\n",
+ " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n",
+ " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n",
+ " return switch_ref_catalog(ref)\n",
+ "\n",
+ "\n",
+ "def switch_ref_catalog(ref: str):\n",
+ " \"\"\"\n",
+ " Switch a branch. When we switch the branch via Hive, we will need to reconnect to Hive\n",
+ " \"\"\"\n",
+ " # The important args below are:\n",
+ " # catalog-impl: which Iceberg catalog to use, in this case we want NessieCatalog\n",
+ " # uri: the location of the nessie server.\n",
+ " # ref: the Nessie ref/branch we want to use (defaults to main)\n",
+ " # warehouse: the location this catalog should store its data\n",
+ " return hive.connect(\n",
+ " \"localhost\",\n",
+ " configuration={\n",
+ " \"iceberg.catalog.dev_catalog.catalog-impl\": \"org.apache.iceberg.nessie.NessieCatalog\",\n",
+ " \"iceberg.catalog.dev_catalog.uri\": \"http://localhost:19120/api/v1\",\n",
+ " \"iceberg.catalog.dev_catalog.ref\": ref,\n",
+ " \"iceberg.catalog.dev_catalog.warehouse\": warehouse,\n",
+ " },\n",
+ " ).cursor()\n",
+ "\n",
+ "\n",
+ "create_namespace(\"main\", [\"nba\"])\n",
+ "\n",
+ "\n",
+ "print(\"\\n\\nHive running\\n\\n\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Solving Data Engineering problems with Nessie\n",
+ "============================\n",
+ "\n",
+ "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n",
+ "\n",
+ "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Set up Nessie branches (via Nessie CLI)\n",
+ "----------------------------\n",
+ "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n",
+ "\n",
+ "- Create a new branch named `dev`\n",
+ "- List all branches\n",
+ "\n",
+ "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "current_ref = create_ref_catalog(\"dev\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
}
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3 (ipykernel)",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.7.12"
+ },
+ "source": [
+ "We have created the branch `dev` and we can see the branch with the Nessie `hash` it's currently pointing to.\n",
+ "\n",
+ "Below we list all branches. Note that the auto-created `main` branch already exists and both branches initially point at the same `hash`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
}
+ },
+ "outputs": [],
+ "source": [
+ "!nessie --verbose branch"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create tables under dev branch\n",
+ "-------------------------------------\n",
+ "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n",
+ "\n",
+ "We create two tables under the `dev` branch:\n",
+ "- `salaries`\n",
+ "- `totals_stats`\n",
+ "\n",
+ "These tables list the salaries per player per year and their stats per year.\n",
+ "\n",
+ "To create the data we:\n",
+ "\n",
+ "1. switch our branch context to dev\n",
+ "2. create the table\n",
+ "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Creating our demo schema\n",
+ "current_ref.execute(\"CREATE SCHEMA IF NOT EXISTS nba\")\n",
+ "\n",
+ "print(\"\\nCreated schema nba\\n\")\n",
+ "\n",
+ "\n",
+ "print(\"\\nCreating tables nba.salaries and nba.totals_stats....\\n\")\n",
+ "\n",
+ "# Creating `salaries` table\n",
+ "\n",
+ "current_ref.execute(\n",
+ " f\"\"\"CREATE TABLE IF NOT EXISTS nba.salaries (Season STRING,\n",
+ " Team STRING, Salary STRING, Player STRING)\n",
+ " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n",
+ " LOCATION '{warehouse}/nba/salaries'\n",
+ " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n",
+ " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n",
+ ")\n",
+ "\n",
+ "## We create a temporary table to load data into our target table since\n",
+ "## is not possible to load data directly from CSV into non-native table.\n",
+ "current_ref.execute(\n",
+ " \"\"\"CREATE TABLE IF NOT EXISTS nba.salaries_temp (Season STRING,\n",
+ " Team STRING, Salary STRING, Player STRING)\n",
+ " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n",
+ ")\n",
+ "\n",
+ "current_ref.execute(f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/salaries.csv\" OVERWRITE INTO TABLE nba.salaries_temp')\n",
+ "current_ref.execute(\"INSERT OVERWRITE TABLE nba.salaries SELECT * FROM nba.salaries_temp\")\n",
+ "\n",
+ "print(\"\\nCreated and inserted data into table nba.salaries from dataset salaries\\n\")\n",
+ "\n",
+ "\n",
+ "# Creating `totals_stats` table\n",
+ "\n",
+ "current_ref.execute(\n",
+ " f\"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats (\n",
+ " Season STRING, Age STRING, Team STRING, ORB STRING,\n",
+ " DRB STRING, TRB STRING, AST STRING, STL STRING,\n",
+ " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n",
+ " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n",
+ " LOCATION '{warehouse}/nba/totals_stats'\n",
+ " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n",
+ " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n",
+ ")\n",
+ "\n",
+ "## We create a temporary table to load data into our target table since\n",
+ "## is not possible to load data directly from CSV into non-native table.\n",
+ "current_ref.execute(\n",
+ " \"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats_temp (\n",
+ " Season STRING, Age STRING, Team STRING, ORB STRING,\n",
+ " DRB STRING, TRB STRING, AST STRING, STL STRING,\n",
+ " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n",
+ " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n",
+ ")\n",
+ "\n",
+ "current_ref.execute(\n",
+ " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/totals_stats.csv\" OVERWRITE INTO TABLE nba.totals_stats_temp'\n",
+ ")\n",
+ "current_ref.execute(\"INSERT OVERWRITE TABLE nba.totals_stats SELECT * FROM nba.totals_stats_temp\")\n",
+ "\n",
+ "print(\"\\nCreated and inserted data into table nba.totals_stats from dataset totals_stats\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "Now we count the rows in our tables to ensure they are the same number as the csv files. Unlike Spark and Flink demos, we can't use the notation of `table@branch` (see the GitHub issue [here](https://github.com/projectnessie/nessie/issues/1985)). Therefore, we just set Nessie ref settings through Hive setting `SET iceberg.catalog.{catalog}.ref = {branch}` whenever we want to work on a specific branch."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ },
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# We make sure we are still in dev branch\n",
+ "current_ref = switch_ref_catalog(\"dev\")\n",
+ "\n",
+ "print(\"\\nCounting rows in nba.salaries\\n\")\n",
+ "\n",
+ "# We count now\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n",
+ "table_count = current_ref.fetchone()[0]\n",
+ "\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n",
+ "csv_count = current_ref.fetchone()[0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)\n",
+ "\n",
+ "print(\"\\nCounting rows in nba.totals_stats\\n\")\n",
+ "\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n",
+ "table_count = current_ref.fetchone()[0]\n",
+ "\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n",
+ "csv_count = current_ref.fetchone()[0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Check generated tables\n",
+ "----------------------------\n",
+ "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n",
+ "let's verify that the `main` branch was not altered by our changes."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "And on the `dev` branch we expect to see two tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nessie content list --ref dev"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "We can also verify that the `dev` and `main` branches point to different commits"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nessie --verbose branch"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Dev promotion into main\n",
+ "-----------------------\n",
+ "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n",
+ "We merge `dev` into `main` via the command line `merge` command.\n",
+ "Both branches should be at the same revision after merging/promotion."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nessie merge dev -b main --force"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "We can verify that the `main` branch now contains the expected tables and row counts.\n",
+ "\n",
+ "The tables are now on `main` and ready for consumption by our blog authors and analysts!"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nessie --verbose branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# We switch to main branch\n",
+ "current_ref = switch_ref_catalog(\"main\")\n",
+ "\n",
+ "print(\"\\nCounting rows in nba.salaries\\n\")\n",
+ "\n",
+ "# We count now\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n",
+ "table_count = current_ref.fetchone()[0]\n",
+ "\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n",
+ "csv_count = current_ref.fetchone()[0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)\n",
+ "\n",
+ "print(\"\\nCounting rows in nba.totals_stats\\n\")\n",
+ "\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n",
+ "table_count = current_ref.fetchone()[0]\n",
+ "\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n",
+ "csv_count = current_ref.fetchone()[0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "Perform regular ETL on the new tables\n",
+ "-------------------\n",
+ "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n",
+ "\n",
+ "1. Update the salaries table to add new data\n",
+ "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n",
+ "3. We create a new table to hold information about the players' appearances in all-star games\n",
+ "\n",
+ "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "current_ref = create_ref_catalog(\"etl\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# add some salaries for Kevin Durant\n",
+ "current_ref.execute(\n",
+ " \"\"\"INSERT INTO nba.salaries\n",
+ " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n",
+ " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n",
+ " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n",
+ " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"\\nCreating table nba.allstar_games_stats\\n\")\n",
+ "\n",
+ "# Creating `allstar_games_stats` table\n",
+ "current_ref.execute(\n",
+ " f\"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_games_stats (\n",
+ " Season STRING, Age STRING, Team STRING, ORB STRING,\n",
+ " TRB STRING, AST STRING, STL STRING, BLK STRING,\n",
+ " TOV STRING, PF STRING, PTS STRING, Player STRING)\n",
+ " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n",
+ " LOCATION '{warehouse}/nba/allstar_games_stats'\n",
+ " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n",
+ " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n",
+ ")\n",
+ "\n",
+ "## We create a temporary table to load data into our target table since\n",
+ "## it is not possible to load data directly from CSV into a non-native table.\n",
+ "current_ref.execute(\n",
+ " \"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_table_temp (\n",
+ " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING,\n",
+ " AST STRING, STL STRING, BLK STRING,\n",
+ " TOV STRING, PF STRING, PTS STRING, Player STRING)\n",
+ " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n",
+ ")\n",
+ "\n",
+ "current_ref.execute(\n",
+ " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/allstar_games_stats.csv\" OVERWRITE INTO TABLE nba.allstar_table_temp'\n",
+ ")\n",
+ "current_ref.execute(\"INSERT OVERWRITE TABLE nba.allstar_games_stats SELECT * FROM nba.allstar_table_temp\")\n",
+ "\n",
+ "print(\"\\nCreated and inserted data into table nba.allstar_table_temp from dataset allstar_games_stats\\n\")\n",
+ "\n",
+ "\n",
+ "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n",
+ "\n",
+ "# Since we can't do 'table@branch'\n",
+ "current_ref = switch_ref_catalog(\"etl\")\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n",
+ "print(current_ref.fetchone()[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can verify that the new table isn't on the `main` branch but is present on the `etl` branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Since we have been working on the `etl` branch, the `allstar_games_stats` table is not on the `main` branch\n",
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# We should see the new `allstar_games_stats` table on the `etl` branch\n",
+ "!nessie content list --ref etl"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we are happy with the data we can again merge it into `main`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie merge etl -b main --force"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "Now let's verify that the changes exist on the `main` branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "!nessie --verbose branch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# We switch to the main branch\n",
+ "current_ref = switch_ref_catalog(\"main\")\n",
+ "\n",
+ "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n",
+ "\n",
+ "# We count now\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n",
+ "table_count = current_ref.fetchone()[0]\n",
+ "\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_table_temp\")\n",
+ "csv_count = current_ref.fetchone()[0]\n",
+ "assert table_count == csv_count\n",
+ "print(table_count)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Create `experiment` branch\n",
+ "--------------------------------\n",
+ "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n",
+ "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n",
+ "and carry out our experiment, which could consist of the following steps:\n",
+ "- drop `totals_stats` table\n",
+ "- add data to `salaries` table\n",
+ "- compare `experiment` and `main` tables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "current_ref = create_ref_catalog(\"experiment\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop the `totals_stats` table on the `experiment` branch\n",
+ "current_ref.execute(\"DROP TABLE nba.totals_stats\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# add some salaries for Dirk Nowitzki\n",
+ "current_ref.execute(\n",
+ " \"\"\"INSERT INTO nba.salaries VALUES\n",
+ " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n",
+ " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n",
+ " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n",
+ " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `totals_stats`)\n",
+ "!nessie content list --ref experiment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# `main` hasn't been changed and still has the `totals_stats` table\n",
+ "!nessie content list"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's take a look at the contents of the `salaries` table on the `experiment` branch."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "current_ref = switch_ref_catalog(\"experiment\")\n",
+ "\n",
+ "print(\"\\nCounting rows in nba.salaries\\n\")\n",
+ "\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n",
+ "print(current_ref.fetchone()[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "and compare to the contents of the `salaries` table on the `main` branch."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "current_ref = switch_ref_catalog(\"main\")\n",
+ "\n",
+ "# the following INSERT is a workaround for https://github.com/apache/iceberg/pull/4509 until iceberg 0.13.2 is released\n",
+ "# add a single salary for Dirk Nowitzki (so we expect 3 fewer total rows)\n",
+ "current_ref.execute(\n",
+ " \"\"\"INSERT INTO nba.salaries VALUES\n",
+ " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n",
+ ")\n",
+ "\n",
+ "print(\"\\nCounting rows in nba.salaries\\n\")\n",
+ "\n",
+ "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n",
+ "print(current_ref.fetchone()[0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "And finally, let's clean up after ourselves"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!nessie branch --delete dev\n",
+ "!nessie branch --delete etl\n",
+ "!nessie branch --delete experiment"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 4
-}
\ No newline at end of file
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/requirements_dev.txt b/notebooks/requirements_dev.txt
index 3d705656..fbbb4280 100644
--- a/notebooks/requirements_dev.txt
+++ b/notebooks/requirements_dev.txt
@@ -15,18 +15,18 @@
#
-r requirements.txt
assertpy==1.1
+build==0.10.0
bump2version==1.0.1
-build==0.8.0
-ipython==7.34.0
-jupyterlab==3.4.7
+ipython==8.18.0
+jupyterlab==3.6.6
nbstripout==0.6.1
-pip==22.2.2
-pytest==7.1.3
-pytest-mock==3.8.2
-pytest-mypy==0.9.1
+pip==23.3.1
+pytest==7.4.3
+pytest-mock==3.12.0
+pytest-mypy==0.10.3
pytest-runner==6.0.0
testbook[dev]==0.4.2
-tox==3.26.0
+tox==4.11.3
twine==4.0.1
watchdog==2.1.9
-wheel==0.37.1
+wheel==0.41.3
diff --git a/notebooks/requirements_lint.txt b/notebooks/requirements_lint.txt
index 57328b6c..6936aebc 100644
--- a/notebooks/requirements_lint.txt
+++ b/notebooks/requirements_lint.txt
@@ -14,14 +14,13 @@
# limitations under the License.
#
-r requirements_dev.txt
-bandit==1.7.4
-black[jupyter]==22.8.0
-flake8==5.0.4
-flake8-annotations==2.9.1
+bandit==1.7.5
+black[jupyter]==23.11.0
+flake8==6.1.0
+flake8-annotations==3.0.1
flake8-bandit==4.1.1
-flake8-black==0.3.3
-flake8-bugbear==22.9.23
-flake8-docstrings==1.6.0
-flake8-import-order==0.18.1
-pytest-mypy==0.9.1
-safety==2.2.0
+flake8-black==0.3.6
+flake8-bugbear==23.9.16
+flake8-docstrings==1.7.0
+flake8-import-order==0.18.2
+pytest-mypy==0.10.3
diff --git a/notebooks/tests/__init__.py b/notebooks/tests/__init__.py
index 4c3decb3..3e363460 100644
--- a/notebooks/tests/__init__.py
+++ b/notebooks/tests/__init__.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
#
# Copyright (C) 2020 Dremio
#
@@ -16,14 +17,12 @@
#
"""Unit tests for demo notebooks."""
import os
-import platform
import shutil
import subprocess # noqa: S404
from contextlib import contextmanager
from typing import Iterator
from typing import List
-from utils import fetch_nessie
from utils import fetch_nessie_jar
@@ -66,12 +65,5 @@ def start_nessie() -> Iterator[subprocess.Popen]:
def _fetch_and_get_nessie_start_command() -> List[str]:
- operating_system = platform.system().lower()
-
- if operating_system == "darwin":
- # In Mac case, we use the nessie jar
- runner = fetch_nessie_jar()
- return ["java", "-jar", runner]
- else:
- runner = fetch_nessie()
- return ["./" + runner]
+ runner = fetch_nessie_jar()
+ return ["java", "-jar", runner]
diff --git a/notebooks/tests/scripts/start_hive b/notebooks/tests/scripts/start_hive
index 7e3f03a6..5d2c50f6 100755
--- a/notebooks/tests/scripts/start_hive
+++ b/notebooks/tests/scripts/start_hive
@@ -10,6 +10,25 @@ HIVE_VERSION=$(python -c "import utils;print(utils._HIVE_VERSION)")
export HADOOP_HOME=$PWD/hadoop-$HADOOP_VERSION
+# Check for Java 8 + 11 for tox (also in /docker/binder/start.hive)
+if [[ -z ${JAVA8_HOME} || -z ${JAVA11_HOME} || ! -d ${JAVA8_HOME} || ! -d ${JAVA11_HOME} ]] ; then
+ cat <<! > /dev/stderr
+
+
+============================================================================================================
+Define the JAVA8_HOME and JAVA11_HOME environment variables to point to Java 8 and Java 11 development kits.
+============================================================================================================
+
+Need Java 8 for Hive server to work.
+Java 11 (not newer!) is required for Spark, but also Nessie.
+
+
+!
+ exit 1
+fi
+
#Start Hive
+echo "Starting Hive for tox, current directory: $(pwd)"
+rm -f nohup.out derby.log
chmod +x $PWD/../docker/binder/start.hive
-nohup $PWD/../docker/binder/start.hive $PWD $PWD/../docker/binder/resources $HIVE_VERSION
\ No newline at end of file
+nohup $PWD/../docker/binder/start.hive $PWD $PWD/../docker/binder/resources $HIVE_VERSION
diff --git a/notebooks/tests/test_nessie_delta_demo_nba.py b/notebooks/tests/test_nessie_delta_demo_nba.py
deleted file mode 100644
index e870776e..00000000
--- a/notebooks/tests/test_nessie_delta_demo_nba.py
+++ /dev/null
@@ -1,100 +0,0 @@
-#!/usr/bin/env python
-#
-# Copyright (C) 2020 Dremio
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-"""Tests the Nessie + Iceberg + Spark Jupyter Notebook with the NBA dataset."""
-from typing import Iterator
-
-import pytest
-from assertpy import assert_that
-from assertpy.assertpy import AssertionBuilder
-from testbook import testbook
-from testbook.client import TestbookNotebookClient
-from utils import fetch_spark
-
-from . import _find_notebook
-from . import _remove_folders
-from . import start_nessie
-
-num_salaries_on_experiment = """count(1)
-0 58"""
-
-num_salaries_on_main = """count(1)
-0 54"""
-
-
-@pytest.fixture(scope="module")
-def notebook() -> Iterator[TestbookNotebookClient]:
- """Pytest fixture to run a notebook."""
- path_to_notebook = _find_notebook("nessie-delta-demo-nba.ipynb")
- fetch_spark()
-
- with start_nessie():
- with testbook(path_to_notebook, timeout=300) as tb:
- tb.execute()
- yield tb
- # Clean all the folders that being created by this test
- _remove_folders(["spark-warehouse", "spark_warehouse"])
-
-
-def _assert_that_notebook(
- text: str, notebook: TestbookNotebookClient, count: int = 0
-) -> AssertionBuilder:
- for seen, value in enumerate(
- n for n, i in enumerate(notebook.cells) if text in i["source"]
- ):
- if seen == count:
- return assert_that(notebook.cell_output_text(value))
- raise Exception(f"Unable to find cell with text: {text}")
-
-
-def test_notebook_output(notebook: TestbookNotebookClient) -> None:
- """Runs through the entire notebook and checks the output.
-
- :param notebook: The notebook to test
- :return:
- """
- assertion = lambda x: _assert_that_notebook(x, notebook) # NOQA
- assertion_counted = lambda x, y: _assert_that_notebook(x, notebook, y) # NOQA
-
- assertion("findspark.init").contains("Spark Running")
-
- assertion("CREATE BRANCH dev").contains("Branch").contains("dev")
-
- assertion("INSERT INTO totals_stats SELECT * FROM stats_table").is_equal_to(
- "Empty DataFrame\nColumns: []\nIndex: []"
- )
-
- assertion_counted("LIST REFERENCES", 1).contains("main").contains("dev").contains(
- "Branch"
- )
-
- assertion_counted(
- 'spark.sql("select count(*) from salaries").toPandas()', 2
- ).is_equal_to(num_salaries_on_experiment)
-
- assertion_counted(
- 'spark.sql("select count(*) from salaries").toPandas()', 3
- ).is_equal_to(num_salaries_on_main)
-
-
-def test_dependency_setup(notebook: TestbookNotebookClient) -> None:
- """Verifies that dependencies were correctly set up.
-
- :param notebook: The notebook to test
- :return:
- """
- spark = notebook.ref("spark")
- assert_that(spark).is_not_none()
diff --git a/notebooks/tox.ini b/notebooks/tox.ini
index 014810dd..49dac650 100644
--- a/notebooks/tox.ini
+++ b/notebooks/tox.ini
@@ -15,24 +15,24 @@
#
[tox]
-envlist = py37, format, lint, flink, hive
+envlist = py310, format, lint, flink, hive
skipsdist = True
[gh-actions]
python =
- 3.7: py37, lint, flink, hive
+ 3.10: py310, lint, flink, hive
[testenv:format]
allowlist_externals=bash
deps =
-r{toxinidir}/requirements_lint.txt
commands =
- black --target-version py37 tests format_notebooks.py
+ black --target-version py310 tests ./format_notebooks.py
python -m format_notebooks
# this formats python code inside the notebooks
- bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py37 --line-length 120 --ipynb'
+ bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py310 --line-length 120 --ipynb'
# this formats cell output from single string to list of strings and removes execution metadata
- bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose nbstripout --keep-output --strip-empty-cells'
+ bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose nbstripout --keep-output --drop-empty-cells'
python -m format_notebooks
[testenv:lint]
@@ -40,15 +40,14 @@ allowlist_externals=bash
deps =
-r{toxinidir}/requirements_lint.txt
commands =
- flake8 tests format_notebooks.py
- bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py37 --line-length 120 --ipynb --check'
+ flake8 tests ./format_notebooks.py
+ bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py310 --line-length 120 --ipynb --check'
[testenv:flink]
setenv =
PYTHONPATH = {toxinidir}:{toxinidir}/../docker
-passenv = TOXENV CI CODECOV_*
+passenv = TOXENV,CI,CODECOV_*
deps =
- --use-deprecated=legacy-resolver
-r{toxinidir}/../docker/binder/requirements_flink.txt
-r{toxinidir}/requirements_dev.txt
commands =
@@ -59,7 +58,7 @@ commands =
allowlist_externals=bash
setenv =
PYTHONPATH = {toxinidir}:{toxinidir}/../docker
-passenv = TOXENV CI CODECOV_* JAVA_HOME
+passenv = TOXENV,CI,CODECOV_*,JAVA_HOME,JAVA8_HOME,JAVA11_HOME
deps =
-r{toxinidir}/../docker/binder/requirements.txt
-r{toxinidir}/requirements_dev.txt
@@ -71,10 +70,10 @@ commands =
[testenv]
setenv =
PYTHONPATH = {toxinidir}:{toxinidir}/../docker
-passenv = TOXENV CI CODECOV_*
+passenv = TOXENV,CI,CODECOV_*
deps =
-r{toxinidir}/../docker/binder/requirements.txt
-r{toxinidir}/requirements_dev.txt
commands =
- nbstripout {toxinidir}/nessie-iceberg-demo-nba.ipynb {toxinidir}/nessie-delta-demo-nba.ipynb
+ nbstripout {toxinidir}/nessie-iceberg-demo-nba.ipynb
pytest --basetemp={envtmpdir} -ra tests --ignore tests/test_nessie_iceberg_flink_demo_nba.py --ignore tests/test_nessie_iceberg_hive_demo_nba.py