diff --git a/.github/workflows/demos-docker-build.yaml b/.github/workflows/demos-docker-build.yaml
index d269b0cc..18ec2367 100644
--- a/.github/workflows/demos-docker-build.yaml
+++ b/.github/workflows/demos-docker-build.yaml
@@ -34,7 +34,7 @@ jobs:
     strategy:
       max-parallel: 4
       matrix:
-        python-version: [3.7]
+        python-version: ["3.10"]
 
     steps:
     - uses: actions/checkout@v3
diff --git a/.github/workflows/notebooks.yaml b/.github/workflows/notebooks.yaml
index 4330c174..3e504c77 100644
--- a/.github/workflows/notebooks.yaml
+++ b/.github/workflows/notebooks.yaml
@@ -34,7 +34,7 @@ jobs:
     strategy:
       max-parallel: 4
       matrix:
-        python-version: [3.7]
+        python-version: ["3.10"]
 
     steps:
     - uses: actions/checkout@v3
diff --git a/.gitignore b/.gitignore
index ef780a61..7a3f27c2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -38,6 +38,9 @@ venv/
 __pycache__/
 .pytest_cache
 
+# pyenv
+.python-version
+
 # Jetbrains IDEs
 /.idea
 *.iws
diff --git a/README.md b/README.md
index 378e17c4..72d1abaa 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ Nessie version is set in Binder at `docker/binder/requirements_base.txt`. Curren
 
 ### Iceberg
 
-Currently we are using Iceberg `0.13.1` and it is specified in both iceberg notebooks as well as `docker/utils/__init__.py`
+Currently, we are using Iceberg `1.4.2` and it is specified in both iceberg notebooks as well as `docker/utils/__init__.py`
 
 ### Spark
 
 Only has to be updated in `docker/binder/requirements.txt`. Currently, Iceberg s
 
 ### Flink
 
-Flink version is set in Binder at `docker/binder/requirements_flink.txt`. Currently, we are using `1.13.6`.
+Flink version is set in Binder at `docker/binder/requirements_flink.txt`. Currently, we are using `1.17.1`.
 
 ### Hadoop
 
@@ -53,7 +53,7 @@ Of course, Binder just lets a user "simply start" a notebook via a simple "click
 
 ## Development
 
 For development, you will need to make sure to have the following installed:
-- Python 3.7+
+- Python 3.10+
 - pre-commit
 
 Regarding pre-commit, you will need to make sure it is installed through `pre-commit install` in order to install the hooks locally since this repo
diff --git a/binder/Dockerfile b/binder/Dockerfile
index 53efebc4..39d5fb60 100644
--- a/binder/Dockerfile
+++ b/binder/Dockerfile
@@ -2,7 +2,7 @@
 
 # Tag will be automatically generated through pre-commit hook if any changes
 # happened in the docker/ folder
-FROM ghcr.io/projectnessie/nessie-binder-demos:649ec80b8fa7d9666178380a33b2e645a52d5985
+FROM ghcr.io/projectnessie/nessie-binder-demos:94fbc35d252dbf5e421e1088a842023ef74f8839
 
 # Create the necessary folders for the demo, this will be created and owned by {NB_USER}
 RUN mkdir -p notebooks && mkdir -p datasets
diff --git a/binder/README.md b/binder/README.md
index ba7274a6..55165bc5 100644
--- a/binder/README.md
+++ b/binder/README.md
@@ -1,8 +1,8 @@
 ## Building binder locally
 
 ### Prerequisites
-You need to have a python 3.7+ installed.
-We recommend to use [pyenv](https://github.com/pyenv/pyenv) for managing your python environment(s).
+You need to have Python 3.11+ installed.
+We recommend using [pyenv](https://github.com/pyenv/pyenv) for managing your python environment(s).
 
 To build the binder image locally, you first need to install the `jupyter-repo2docker` dependency:
 
@@ -29,8 +29,8 @@ Run (or look into) the `build_run_local_docker.sh` script how to do this semi-au
 
 After those steps, the binder should be running on your local machine.
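A note on the two workflow hunks at the top of this diff: YAML types bare scalars, so an unquoted `3.10` in the `python-version` matrix is parsed as the float `3.1`, and `actions/setup-python` would then resolve the wrong interpreter, which is why the new entries are quoted. A quick illustration (PyYAML is used here purely for demonstration):

```python
# A bare 3.10 in a YAML list loads as the float 3.1; quoting keeps it a string.
import yaml  # assumes PyYAML is available

print(yaml.safe_load("python-version: [3.10]"))    # {'python-version': [3.1]}
print(yaml.safe_load('python-version: ["3.10"]'))  # {'python-version': ['3.10']}
```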
Next, find the output similar to this: ```shell -[C 13:38:25.199 NotebookApp] - +[C 13:38:25.199 NotebookApp] + To access the notebook, open this file in a browser: file:///home/jovyan/.local/share/jupyter/runtime/nbserver-40-open.html Or copy and paste this URL: diff --git a/docker/binder/postBuild b/docker/binder/postBuild index d23de2c6..d7ba9374 100644 --- a/docker/binder/postBuild +++ b/docker/binder/postBuild @@ -26,7 +26,7 @@ python -m ipykernel install --name "flink-demo" --user python -c "import utils;utils._copy_all_hadoop_jars_to_pyflink()" conda deactivate -python -c "import utils;utils.fetch_nessie()" +python -c "import utils;utils.fetch_nessie_jar()" python -c "import utils;utils.fetch_spark()" diff --git a/docker/binder/requirements.txt b/docker/binder/requirements.txt index a95841b6..acebdc06 100644 --- a/docker/binder/requirements.txt +++ b/docker/binder/requirements.txt @@ -1,5 +1,5 @@ -r requirements_base.txt findspark==2.0.1 -pandas==1.3.5 -pyhive[hive]==0.6.5 -pyspark==3.2.1 +pandas==1.5.3 +pyhive[hive_pure_sasl]==0.7.0 +pyspark==3.5.0 diff --git a/docker/binder/requirements_base.txt b/docker/binder/requirements_base.txt index b842f398..303085b3 100644 --- a/docker/binder/requirements_base.txt +++ b/docker/binder/requirements_base.txt @@ -1 +1 @@ -pynessie==0.30.0 +pynessie==0.65.0 diff --git a/docker/binder/requirements_flink.txt b/docker/binder/requirements_flink.txt index 3e1775e6..664b2d3f 100644 --- a/docker/binder/requirements_flink.txt +++ b/docker/binder/requirements_flink.txt @@ -1,4 +1,2 @@ -r requirements_base.txt -apache-flink==1.13.6 -# flink requires pandas<1.2.0 see https://github.com/apache/flink/blob/release-1.13.6/flink-python/setup.py#L313 -pandas==1.1.5 +apache-flink==1.17.1 diff --git a/docker/binder/start.hive b/docker/binder/start.hive index c8ddd968..d2a06e24 100755 --- a/docker/binder/start.hive +++ b/docker/binder/start.hive @@ -39,7 +39,8 @@ fi export HIVE_HOME=$HIVE_PARENT_DIR/$HIVE_FOLDER_NAME # Create hive warehouse folder -mkdir $HIVE_WAREHOUSE_DIR +rm -rf $HIVE_WAREHOUSE_DIR +mkdir -p $HIVE_WAREHOUSE_DIR # Copy the needed configs to Hive folder cp $RESOURCE_DIR/hive/config/hive-site.xml ${HIVE_HOME}/conf/ diff --git a/docker/utils/__init__.py b/docker/utils/__init__.py index 4434fbd7..d55b6580 100644 --- a/docker/utils/__init__.py +++ b/docker/utils/__init__.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- # # Copyright (C) 2020 Dremio # @@ -18,7 +19,6 @@ import os import shutil import site -import stat import sysconfig import tarfile from typing import Optional @@ -29,7 +29,7 @@ import pyspark _SPARK_VERSION = pyspark.__version__ - _SPARK_FILENAME = f"spark-{_SPARK_VERSION}-bin-hadoop3.2" + _SPARK_FILENAME = f"spark-{_SPARK_VERSION}-bin-hadoop3" _SPARK_URL = f"https://archive.apache.org/dist/spark/spark-{_SPARK_VERSION}/{_SPARK_FILENAME}.tgz" except ImportError: _SPARK_VERSION = None @@ -40,22 +40,28 @@ _HADOOP_FILENAME = f"hadoop-{_HADOOP_VERSION}" _HADOOP_URL = f"https://archive.apache.org/dist/hadoop/common/hadoop-{_HADOOP_VERSION}/{_HADOOP_FILENAME}.tar.gz" -_FLINK_MAJOR_VERSION = "1.13" +_FLINK_MAJOR_VERSION = "1.17" -_ICEBERG_VERSION = "0.13.1" -_ICEBERG_FLINK_FILENAME = f"iceberg-flink-runtime-{_FLINK_MAJOR_VERSION}-{_ICEBERG_VERSION}.jar" +_ICEBERG_VERSION = "1.4.2" +_ICEBERG_FLINK_FILENAME = ( + f"iceberg-flink-runtime-{_FLINK_MAJOR_VERSION}-{_ICEBERG_VERSION}.jar" +) _ICEBERG_FLINK_URL = 
f"https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-{_FLINK_MAJOR_VERSION}/{_ICEBERG_VERSION}/{_ICEBERG_FLINK_FILENAME}" -_ICEBERG_HIVE_FILENAME = f"iceberg-hive-runtime-{_ICEBERG_VERSION}.jar" -_ICEBERG_HIVE_URL = f"https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-hive-runtime/{_ICEBERG_VERSION}/{_ICEBERG_HIVE_FILENAME}" +_ICEBERG_HIVE_FILENAME = f"iceberg-hive3-{_ICEBERG_VERSION}.jar" +_ICEBERG_HIVE_URL = f"https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-hive3/{_ICEBERG_VERSION}/{_ICEBERG_HIVE_FILENAME}" -_HIVE_VERSION = "2.3.9" +_HIVE_VERSION = "3.1.3" _HIVE_FILENAME = f"apache-hive-{_HIVE_VERSION}-bin" _HIVE_URL = ( f"https://archive.apache.org/dist/hive/hive-{_HIVE_VERSION}/{_HIVE_FILENAME}.tar.gz" ) +_NESSIE_VERSION = "0.74.0" + -def _link_file_into_dir(source_file: str, target_dir: str, replace_if_exists=True) -> None: +def _link_file_into_dir( + source_file: str, target_dir: str, replace_if_exists=True +) -> None: assert os.path.isfile(source_file) assert os.path.isdir(target_dir) @@ -75,7 +81,7 @@ def _link_file_into_dir(source_file: str, target_dir: str, replace_if_exists=Tru os.link(source_file, target_file) assert os.path.isfile(target_file), (source_file, target_file) - action = 'replaced' if replaced else 'created' + action = "replaced" if replaced else "created" print(f"Link target was {action}: {target_file} (source: {source_file})") @@ -112,7 +118,9 @@ def _copy_all_hadoop_jars_to_pyflink() -> None: pyflink_lib_dir = _find_pyflink_lib_dir() for _jar_count, jar in enumerate(_jar_files()): _link_file_into_dir(jar, pyflink_lib_dir) - print(f"Linked {_jar_count} HADOOP jar files into the pyflink lib dir at location {pyflink_lib_dir}") + print( + f"Linked {_jar_count} HADOOP jar files into the pyflink lib dir at location {pyflink_lib_dir}" + ) def _find_pyflink_lib_dir() -> Optional[str]: @@ -139,16 +147,6 @@ def _download_file(filename: str, url: str) -> None: f.write(r.content) -def fetch_nessie() -> str: - """Download nessie executable.""" - runner = "nessie-quarkus-runner" - - url = _get_base_nessie_url() - _download_file(runner, url) - os.chmod(runner, os.stat(runner).st_mode | stat.S_IXUSR) - return runner - - def fetch_nessie_jar() -> str: """Download nessie Jar in order to run the tests in Mac""" runner = "nessie-quarkus-runner.jar" @@ -159,12 +157,8 @@ def fetch_nessie_jar() -> str: def _get_base_nessie_url() -> str: - import pynessie - - version = pynessie.__version__ - return "https://github.com/projectnessie/nessie/releases/download/nessie-{}/nessie-quarkus-{}-runner".format( - version, version + _NESSIE_VERSION, _NESSIE_VERSION ) diff --git a/notebooks/nessie-iceberg-demo-nba.ipynb b/notebooks/nessie-iceberg-demo-nba.ipynb index 7b2632a0..2b540500 100644 --- a/notebooks/nessie-iceberg-demo-nba.ipynb +++ b/notebooks/nessie-iceberg-demo-nba.ipynb @@ -26,68 +26,7 @@ "name": "#%%\n" } }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: An illegal reflective access operation has occurred\n", - "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", - "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", - "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", - "WARNING: All illegal access operations will be denied in a future release\n", - "Ivy 
Default Cache set to: /home/jovyan/.ivy2/cache\n", - "The jars for the packages stored in: /home/jovyan/.ivy2/jars\n", - "org.apache.iceberg#iceberg-spark-runtime-3.2_2.12 added as a dependency\n", - "org.projectnessie#nessie-spark-3.2-extensions added as a dependency\n", - ":: resolving dependencies :: org.apache.spark#spark-submit-parent-6cba98e4-6e15-458e-a366-568683d289f7;1.0\n", - "\tconfs: [default]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - ":: loading settings :: url = jar:file:/home/jovyan/spark-3.2.1-bin-hadoop3.2/jars/ivy-2.5.0.jar!/org/apache/ivy/core/settings/ivysettings.xml\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\tfound org.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.13.1 in central\n", - "\tfound org.projectnessie#nessie-spark-3.2-extensions;0.30.0 in central\n", - "downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/0.13.1/iceberg-spark-runtime-3.2_2.12-0.13.1.jar ...\n", - "\t[SUCCESSFUL ] org.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.13.1!iceberg-spark-runtime-3.2_2.12.jar (1331ms)\n", - "downloading https://repo1.maven.org/maven2/org/projectnessie/nessie-spark-3.2-extensions/0.30.0/nessie-spark-3.2-extensions-0.30.0.jar ...\n", - "\t[SUCCESSFUL ] org.projectnessie#nessie-spark-3.2-extensions;0.30.0!nessie-spark-3.2-extensions.jar (70ms)\n", - ":: resolution report :: resolve 13309ms :: artifacts dl 1405ms\n", - "\t:: modules in use:\n", - "\torg.apache.iceberg#iceberg-spark-runtime-3.2_2.12;0.13.1 from central in [default]\n", - "\torg.projectnessie#nessie-spark-3.2-extensions;0.30.0 from central in [default]\n", - "\t---------------------------------------------------------------------\n", - "\t| | modules || artifacts |\n", - "\t| conf | number| search|dwnlded|evicted|| number|dwnlded|\n", - "\t---------------------------------------------------------------------\n", - "\t| default | 2 | 2 | 2 | 0 || 2 | 2 |\n", - "\t---------------------------------------------------------------------\n", - ":: retrieving :: org.apache.spark#spark-submit-parent-6cba98e4-6e15-458e-a366-568683d289f7\n", - "\tconfs: [default]\n", - "\t2 artifacts copied, 0 already retrieved (22360kB/20ms)\n", - "22/05/24 07:43:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", - "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", - "Setting default log level to \"WARN\".\n", - "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Spark Running\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import findspark\n", @@ -102,7 +41,7 @@ "# we need iceberg libraries and the nessie sql extensions\n", "conf.set(\n", " \"spark.jars.packages\",\n", - " f\"org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.1,org.projectnessie:nessie-spark-3.2-extensions:{pynessie_version}\",\n", + " f\"org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.4.2,org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.74.0\",\n", ")\n", "# ensure python <-> java interactions are w/ pyarrow\n", "conf.set(\"spark.sql.execution.pyarrow.enabled\", \"true\")\n", @@ -121,7 +60,7 @@ "# enable the extensions for both Nessie and Iceberg\n", "conf.set(\n", " \"spark.sql.extensions\",\n", - " \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSpark32SessionExtensions\",\n", + " \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions\",\n", ")\n", "# finally, start up the Spark server\n", "spark = SparkSession.builder.config(conf=conf).getOrCreate()\n", @@ -158,55 +97,10 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchdev2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ + "spark.sql(\"CREATE NAMESPACE dev_catalog.nba\")\n", + "\n", "spark.sql(\"CREATE BRANCH dev IN dev_catalog FROM main\").toPandas()" ] }, @@ -223,61 +117,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchdev2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...
1Branchmain2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...\n", - "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" ] @@ -307,46 +147,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"USE REFERENCE dev IN dev_catalog\")\n", "\n", @@ -387,23 +188,11 @@ "cell_type": "code", "execution_count": null, "metadata": { - "jupyter": { - "outputs_hidden": false - }, "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n", - "92\n" - ] - } - ], + "outputs": [], "source": [ "table_count = spark.sql(\"select count(*) from dev_catalog.nba.`salaries@dev`\").toPandas().values[0][0]\n", "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n", @@ -430,56 +219,11 @@ "cell_type": "code", "execution_count": null, "metadata": { - "jupyter": { - "outputs_hidden": false - }, "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [namespace, tableName, isTemporary]\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" @@ -496,61 +240,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbatotals_statsFalse
1nbasalariesFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba totals_stats False\n", - "1 nba salaries False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"USE REFERENCE dev IN dev_catalog\").toPandas()\n", "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" @@ -571,61 +261,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchdev70a8df769b477de5b9157691edef1efca8a640ae9f7137...
1Branchmain2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch dev 70a8df769b477de5b9157691edef1efca8a640ae9f7137...\n", - "1 Branch main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616a..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" ] @@ -645,52 +281,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namehash
0mainaf5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38...
\n", - "
" - ], - "text/plain": [ - " name hash\n", - "0 main af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"MERGE BRANCH dev INTO main IN dev_catalog\").toPandas()" ] @@ -708,61 +299,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchmainaf5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38...
1Branchdev70a8df769b477de5b9157691edef1efca8a640ae9f7137...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch main af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38...\n", - "1 Branch dev 70a8df769b477de5b9157691edef1efca8a640ae9f7137..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" ] @@ -771,61 +308,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbatotals_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba totals_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" @@ -835,16 +318,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "50\n", - "92\n" - ] - } - ], + "outputs": [], "source": [ "table_count = spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas().values[0][0]\n", "csv_count = spark.sql(\"select count(*) from salaries_table\").toPandas().values[0][0]\n", @@ -880,54 +354,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchetlaf5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch etl af5e8edd1b769f3840ee485c0c6fc6aeaaebeb10d6ad38..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"CREATE BRANCH etl IN dev_catalog FROM main\").toPandas()" ] @@ -940,46 +367,7 @@ "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# add some salaries for Kevin Durant\n", "spark.sql(\"USE REFERENCE etl IN dev_catalog\")\n", @@ -997,46 +385,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Dropping a column in the `totals_stats` table\n", "spark.sql(\"ALTER TABLE dev_catalog.nba.totals_stats DROP COLUMN Age\").toPandas()" @@ -1046,50 +395,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
count(1)
047
\n", - "
" - ], - "text/plain": [ - " count(1)\n", - "0 47" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Creating `allstar_games_stats` table and viewing the contents\n", "spark.sql(\n", @@ -1119,61 +425,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbatotals_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba totals_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" @@ -1183,68 +435,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbaallstar_games_statsFalse
1nbatotals_statsFalse
2nbasalariesFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba allstar_games_stats False\n", - "1 nba totals_stats False\n", - "2 nba salaries False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"USE REFERENCE etl IN dev_catalog\").toPandas()\n", "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" @@ -1261,59 +452,11 @@ "cell_type": "code", "execution_count": null, "metadata": { - "jupyter": { - "outputs_hidden": false - }, "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namehash
0maine0f4167c8947d15161a8fedc376da020f1ecff63c5eea2...
\n", - "
" - ], - "text/plain": [ - " name hash\n", - "0 main e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"MERGE BRANCH etl INTO main IN dev_catalog\").toPandas()" ] @@ -1329,75 +472,11 @@ "cell_type": "code", "execution_count": null, "metadata": { - "jupyter": { - "outputs_hidden": false - }, "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbaallstar_games_statsFalse
2nbatotals_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba allstar_games_stats False\n", - "2 nba totals_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" @@ -1407,68 +486,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchmaine0f4167c8947d15161a8fedc376da020f1ecff63c5eea2...
1Branchetl957c1254ab0a3e3bd1e306669ebe7073e27a97966bcfda...
2Branchdev70a8df769b477de5b9157691edef1efca8a640ae9f7137...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch main e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2...\n", - "1 Branch etl 957c1254ab0a3e3bd1e306669ebe7073e27a97966bcfda...\n", - "2 Branch dev 70a8df769b477de5b9157691edef1efca8a640ae9f7137..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"LIST REFERENCES IN dev_catalog\").toPandas()" ] @@ -1477,15 +495,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "47\n" - ] - } - ], + "outputs": [], "source": [ "table_count = spark.sql(\"select count(*) from dev_catalog.nba.allstar_games_stats\").toPandas().values[0][0]\n", "csv_count = spark.sql(\"select count(*) from allstar_table\").toPandas().values[0][0]\n", @@ -1511,54 +521,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
refTypenamehash
0Branchexperimente0f4167c8947d15161a8fedc376da020f1ecff63c5eea2...
\n", - "
" - ], - "text/plain": [ - " refType name hash\n", - "0 Branch experiment e0f4167c8947d15161a8fedc376da020f1ecff63c5eea2..." - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"CREATE BRANCH experiment IN dev_catalog FROM main\").toPandas()\n", "spark.sql(\"USE REFERENCE experiment IN dev_catalog\").toPandas()" @@ -1568,46 +531,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# Drop the `totals_stats` table on the `experiment` branch\n", "spark.sql(\"DROP TABLE dev_catalog.nba.totals_stats\").toPandas()" @@ -1617,46 +541,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
\n", - "
" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: []\n", - "Index: []" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "# add some salaries for Dirk Nowitzki\n", "spark.sql(\n", @@ -1673,61 +558,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbaallstar_games_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba allstar_games_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" ] @@ -1736,75 +567,11 @@ "cell_type": "code", "execution_count": null, "metadata": { - "jupyter": { - "outputs_hidden": false - }, "pycharm": { "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
namespacetableNameisTemporary
0nbasalariesFalse
1nbaallstar_games_statsFalse
2nbatotals_statsFalse
\n", - "
" - ], - "text/plain": [ - " namespace tableName isTemporary\n", - "0 nba salaries False\n", - "1 nba allstar_games_stats False\n", - "2 nba totals_stats False" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"USE REFERENCE main IN dev_catalog\").toPandas()\n", "spark.sql(\"SHOW TABLES IN dev_catalog\").toPandas()" @@ -1822,50 +589,7 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
count(1)
058
\n", - "
" - ], - "text/plain": [ - " count(1)\n", - "0 58" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"select count(*) from dev_catalog.nba.`salaries@experiment`\").toPandas()" ] @@ -1886,50 +610,7 @@ "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
count(1)
054
\n", - "
" - ], - "text/plain": [ - " count(1)\n", - "0 54" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"select count(*) from dev_catalog.nba.salaries\").toPandas()" ] @@ -1953,18 +634,7 @@ "name": "#%%\n" } }, - "outputs": [ - { - "data": { - "text/plain": [ - "DataFrame[status: string]" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "spark.sql(\"DROP BRANCH dev IN dev_catalog\")\n", "spark.sql(\"DROP BRANCH etl IN dev_catalog\")\n", @@ -1993,4 +663,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/notebooks/nessie-iceberg-flink-demo-nba.ipynb b/notebooks/nessie-iceberg-flink-demo-nba.ipynb index 16646608..3705d3ce 100644 --- a/notebooks/nessie-iceberg-flink-demo-nba.ipynb +++ b/notebooks/nessie-iceberg-flink-demo-nba.ipynb @@ -1,2014 +1,740 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Nessie Iceberg/Flink SQL Demo with NBA Dataset\n", - "============================\n", - "This demo showcases how to use Nessie Python API along with Flink from Iceberg\n", - "\n", - "Initialize PyFlink\n", - "----------------------------------------------\n", - "To get started, we will first have to do a few setup steps that give us everything we need\n", - "to get started with Nessie. In case you're interested in the detailed setup steps for Flink, you can check out the [docs](https://projectnessie.org/tools/iceberg/flink/)\n", - "\n", - "The Binder server has downloaded flink and some data for us as well as started a Nessie server in the background. All we have to do is start Flink\n", - "\n", - "The below cell starts a local Flink session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "SLF4J: Class path contains multiple SLF4J bindings.\n", - "SLF4J: Found binding in [jar:file:/srv/conda/envs/flink-demo/lib/python3.7/site-packages/pyflink/lib/log4j-slf4j-impl-2.17.1.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n", - "SLF4J: Found binding in [jar:file:/srv/conda/envs/flink-demo/lib/python3.7/site-packages/pyflink/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]\n", - "SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.\n", - "SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n", - "Flink running\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "import os\n", - "from pyflink.datastream import StreamExecutionEnvironment\n", - "from pyflink.table import StreamTableEnvironment\n", - "from pyflink.table.expressions import lit\n", - "from pynessie import init\n", - "\n", - "# where we will store our data\n", - "warehouse = os.path.join(os.getcwd(), \"flink-warehouse\")\n", - "# this was downloaded when Binder started, its available on maven central\n", - "iceberg_flink_runtime_jar = os.path.join(os.getcwd(), \"../iceberg-flink-runtime-1.13-0.13.1.jar\")\n", - "assert os.path.exists(iceberg_flink_runtime_jar)\n", - "\n", - "env = StreamExecutionEnvironment.get_execution_environment()\n", - "env.add_jars(\"file://{}\".format(iceberg_flink_runtime_jar))\n", - "table_env = StreamTableEnvironment.create(env)\n", - "\n", - "nessie_client = init()\n", - "\n", - "\n", - "def create_ref_catalog(ref):\n", - " \"\"\"\n", - " Create a flink catalog that is tied to a specific ref.\n", - "\n", - " In order to create the catalog we have to first create the branch\n", - " \"\"\"\n", - " default_branch = nessie_client.get_default_branch()\n", - " if ref != default_branch:\n", - " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n", - " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n", - " # The important args below are:\n", - " # type - tell Flink to use Iceberg as the catalog\n", - " # catalog-impl - which Iceberg catalog to use, in this case we want Nessie\n", - " # uri - the location of the nessie server.\n", - " # ref - the Nessie ref/branch we want to use (defaults to main)\n", - " # warehouse - the location this catalog should store its data\n", - " table_env.execute_sql(\n", - " f\"\"\"CREATE CATALOG {ref}_catalog WITH (\n", - " 'type'='iceberg',\n", - " 'catalog-impl'='org.apache.iceberg.nessie.NessieCatalog',\n", - " 'uri'='http://localhost:19120/api/v1',\n", - " 'ref'='{ref}',\n", - " 'warehouse' = '{warehouse}')\"\"\"\n", - " )\n", - "\n", - "\n", - "create_ref_catalog(nessie_client.get_default_branch())\n", - "print(\"\\n\\n\\nFlink running\\n\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Solving Data Engineering problems with Nessie\n", - "============================\n", - "\n", - "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. 
They need to be able to retrieve data quickly and be able to create charts with it.\n", - "\n", - "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set up Nessie branches (via Nessie CLI)\n", - "----------------------------\n", - "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", - "\n", - "- Create a new branch named `dev`\n", - "- List all branches\n", - "\n", - "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "create_ref_catalog(\"dev\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n", - "\n", - "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[0m\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create tables under dev branch\n", - "-------------------------------------\n", - "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", - "\n", - "We create two tables under the `dev` branch:\n", - "- `salaries`\n", - "- `totals_stats`\n", - "\n", - "These tables list the salaries per player per year and their stats per year.\n", - "\n", - "To create the data we:\n", - "\n", - "1. switch our branch context to dev\n", - "2. create the table\n", - "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. 
A production use case would likely take feeds from official data sources" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING: An illegal reflective access operation has occurred\n", - "WARNING: Illegal reflective access by org.apache.hadoop.security.authentication.util.KerberosUtil (file:/srv/conda/envs/flink-demo/lib/python3.7/site-packages/pyflink/lib/hadoop-auth-2.10.1.jar) to method sun.security.krb5.Config.getInstance()\n", - "WARNING: Please consider reporting this to the maintainers of org.apache.hadoop.security.authentication.util.KerberosUtil\n", - "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", - "WARNING: All illegal access operations will be denied in a future release\n", - "log4j:WARN No appenders could be found for logger (org.apache.htrace.core.Tracer).\n", - "log4j:WARN Please initialize the log4j system properly.\n", - "log4j:WARN See http://logging.apache.org/log4j/1.2/faq.html#noconfig for more info.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,465 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,465 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,464 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:58,465 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,663 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,664 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,664 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,665 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:44:59,665 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "\n", - "\n", - "\n", - "Added 51 rows to the salaries table and 93 rows to the totals_stats table.\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "# Load the dataset\n", - "from pyflink.table import DataTypes\n", - "from pyflink.table.descriptors import Schema, OldCsv, FileSystem\n", - "\n", - "# Creating `salaries` table\n", - "(\n", - " table_env.connect(FileSystem().path(\"../datasets/nba/salaries.csv\"))\n", - " .with_format(\n", - " OldCsv()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " 
.field(\"Salary\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " )\n", - " .with_schema(\n", - " Schema()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"Salary\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " )\n", - " .create_temporary_table(\"dev_catalog.nba.salaries_temp\")\n", - ")\n", - "\n", - "table_env.execute_sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.salaries\n", - " (Season STRING, Team STRING, Salary STRING, Player STRING)\"\"\"\n", - ").wait()\n", - "\n", - "tab = table_env.from_path(\"dev_catalog.nba.salaries_temp\")\n", - "tab.execute_insert(\"dev_catalog.nba.salaries\").wait()\n", - "\n", - "# Creating `totals_stats` table\n", - "(\n", - " table_env.connect(FileSystem().path(\"../datasets/nba/totals_stats.csv\"))\n", - " .with_format(\n", - " OldCsv()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Age\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"ORB\", DataTypes.STRING())\n", - " .field(\"DRB\", DataTypes.STRING())\n", - " .field(\"TRB\", DataTypes.STRING())\n", - " .field(\"AST\", DataTypes.STRING())\n", - " .field(\"STL\", DataTypes.STRING())\n", - " .field(\"BLK\", DataTypes.STRING())\n", - " .field(\"TOV\", DataTypes.STRING())\n", - " .field(\"PTS\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " .field(\"RSorPO\", DataTypes.STRING())\n", - " )\n", - " .with_schema(\n", - " Schema()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Age\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"ORB\", DataTypes.STRING())\n", - " .field(\"DRB\", DataTypes.STRING())\n", - " .field(\"TRB\", DataTypes.STRING())\n", - " .field(\"AST\", DataTypes.STRING())\n", - " .field(\"STL\", DataTypes.STRING())\n", - " .field(\"BLK\", DataTypes.STRING())\n", - " .field(\"TOV\", DataTypes.STRING())\n", - " .field(\"PTS\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " .field(\"RSorPO\", DataTypes.STRING())\n", - " )\n", - " .create_temporary_table(\"dev_catalog.nba.totals_stats_temp\")\n", - ")\n", - "\n", - "table_env.execute_sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.totals_stats (Season STRING, Age STRING, Team STRING,\n", - " ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING, PTS STRING,\n", - " Player STRING, RSorPO STRING)\"\"\"\n", - ").wait()\n", - "\n", - "tab = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\")\n", - "tab.execute_insert(\"dev_catalog.nba.totals_stats\").wait()\n", - "\n", - "salaries = table_env.from_path(\"main_catalog.nba.`salaries@dev`\").select(lit(1).count).to_pandas().values[0][0]\n", - "totals_stats = table_env.from_path(\"main_catalog.nba.`totals_stats@dev`\").select(lit(1).count).to_pandas().values[0][0]\n", - "print(f\"\\n\\n\\nAdded {salaries} rows to the salaries table and {totals_stats} rows to the totals_stats table.\\n\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Now we count the rows in our tables to ensure they are the same number as the csv files. Note we use the `table@branch` notation which overrides the context set by the catalog." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:04,807 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,869 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,872 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,874 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,876 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,879 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,881 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:04,883 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "51\n", - "2022-05-24 07:45:06,280 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,344 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,347 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,351 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,354 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,357 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,360 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:06,364 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "93\n" - ] - } - ], - "source": [ - "table_count = table_env.from_path(\"dev_catalog.nba.`salaries@dev`\").select(\"Season.count\").to_pandas().values[0][0]\n", - "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "table_count = table_env.from_path(\"dev_catalog.nba.`totals_stats@dev`\").select(\"Season.count\").to_pandas().values[0][0]\n", - "csv_count = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check generated tables\n", - "----------------------------\n", - "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", - "let's verify that the `main` branch was not altered by our changes." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "And on the `dev` branch we expect to see two tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.totals_stats\n", - "\tnba.salaries\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list --ref dev" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can also verify that the `dev` and `main` branches point to different commits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dev f48b93594ddead3a7616a271d657f0fff97cd0c4c04d4a579fa165aa96a69908\n", - "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[0m\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dev promotion into main\n", - "-----------------------\n", - "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", - "We merge `dev` into `main` via the command line `merge` command.\n", - "Both branches should be at the same revision after merging/promotion." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie merge dev -b main --force" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can verify that the `main` branch now contains the expected tables and row counts.\n", - "\n", - "The tables are now on `main` and ready for consumption by our blog authors and analysts!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33m* main facfd43be1d062734ca0cda5ae900dde398180bf3f370a19627da8a2419589b0\n", - "\u001b[0m dev f48b93594ddead3a7616a271d657f0fff97cd0c4c04d4a579fa165aa96a69908\n", - "\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:10,661 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,724 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,725 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,727 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,729 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,730 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,732 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:10,733 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,239 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,304 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,307 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,312 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,316 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,319 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,322 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:12,326 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n" - ] - } - ], - "source": [ - "table_count = table_env.from_path(\"main_catalog.nba.salaries\").select(\"Season.count\").to_pandas().values[0][0]\n", - "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - "assert table_count == csv_count\n", - "\n", - "table_count = table_env.from_path(\"main_catalog.nba.totals_stats\").select(\"Season.count\").to_pandas().values[0][0]\n", - "csv_count = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - "assert table_count == csv_count" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Perform regular ETL on the new tables\n", - "-------------------\n", - "Our 
analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", - "\n", - "1. Update the salaries table to add new data\n", - "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n", - "3. We create a new table to hold information about the players appearances in all star games\n", - "\n", - "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "create_ref_catalog(\"etl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:13,368 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n" - ] - } - ], - "source": [ - "# add some salaries for Kevin Durant\n", - "table_env.execute_sql(\n", - " \"\"\"INSERT INTO etl_catalog.nba.salaries\n", - " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n", - " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n", - " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n", - " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n", - ").wait()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Rename the table `totals_stats` to `new_totals_stats`\n", - "table_env.execute_sql(\"ALTER TABLE etl_catalog.nba.totals_stats RENAME TO etl_catalog.nba.new_totals_stats\").wait()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:14,227 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n", - "2022-05-24 07:45:15,480 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,543 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,546 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,549 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,551 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,554 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 
07:45:15,557 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:15,560 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SeasonAgeTeamORBTRBASTSTLBLKTOVPFPTSPlayer
02004-0526LAL367314516Kobe Bryant
12005-0627LAL07830358Kobe Bryant
22006-0728LAL156604131Kobe Bryant
32007-0829LAL01000000Kobe Bryant
42008-0930LAL144401027Kobe Bryant
52009-1031LALKobe Bryant
62009-1025CLE156402125Lebron James
72010-1126MIA21210004329Lebron James
82011-1227MIA067004236Lebron James
92012-1328MIA035104019Lebron James
102013-1429MIA177301022Lebron James
112014-1530CLE157204130Lebron James
122010-1132LAL10143304237Kobe Bryant
132011-1233LAL011201227Kobe Bryant
142012-1334LAL24822129Kobe Bryant
152013-1435LALKobe Bryant
162014-1536LALKobe Bryant
172015-1637LAL167101110Kobe Bryant
181997-9819LAL261201118Kobe Bryant
191999-0021LAL113201315Kobe Bryant
202000-0122LAL247103319Kobe Bryant
212001-0223LAL255100231Kobe Bryant
222002-0324LAL276325522Kobe Bryant
232003-0425LAL144516320Kobe Bryant
24SeasonAgeTeamORBTRBASTSTLBLKTOVPFPTSPlayer
252004-0520CLE186203013Lebron James
262005-0621CLE262201229Lebron James
272006-0722CLE066104028Lebron James
282007-0823CLE189224327Lebron James
292008-0924CLE053003020Lebron James
301992-9329CHI345406530Michael Jordan
311995-9632CHI141100120Michael Jordan
321996-9733CHI31111203414Michael Jordan
331997-9834CHI168302023Michael Jordan
342001-0238WAS04320118Michael Jordan
352002-0339WAS252202320Michael Jordan
362015-1631CLE047004013Lebron James
372016-1732CLE031004223Lebron James
382017-1833CLE0108105229Lebron James
392018-1934LAL284021119Lebron James
401984-8521CHI36231147Michael Jordan
411985-8622CHIMichael Jordan
421986-8723CHI004205211Michael Jordan
431987-8824CHI383442540Michael Jordan
441988-8925CHI123504128Michael Jordan
451989-9026CHI152515117Michael Jordan
461990-9127CHI3552010226Michael Jordan
471991-9228CHI115201218Michael Jordan
\n", - "
" - ], - "text/plain": [ - " Season Age Team ORB TRB AST STL BLK TOV PF PTS Player\n", - "0 2004-05 26 LAL 3 6 7 3 1 4 5 16 Kobe Bryant\n", - "1 2005-06 27 LAL 0 7 8 3 0 3 5 8 Kobe Bryant\n", - "2 2006-07 28 LAL 1 5 6 6 0 4 1 31 Kobe Bryant\n", - "3 2007-08 29 LAL 0 1 0 0 0 0 0 0 Kobe Bryant\n", - "4 2008-09 30 LAL 1 4 4 4 0 1 0 27 Kobe Bryant\n", - "5 2009-10 31 LAL Kobe Bryant\n", - "6 2009-10 25 CLE 1 5 6 4 0 2 1 25 Lebron James\n", - "7 2010-11 26 MIA 2 12 10 0 0 4 3 29 Lebron James\n", - "8 2011-12 27 MIA 0 6 7 0 0 4 2 36 Lebron James\n", - "9 2012-13 28 MIA 0 3 5 1 0 4 0 19 Lebron James\n", - "10 2013-14 29 MIA 1 7 7 3 0 1 0 22 Lebron James\n", - "11 2014-15 30 CLE 1 5 7 2 0 4 1 30 Lebron James\n", - "12 2010-11 32 LAL 10 14 3 3 0 4 2 37 Kobe Bryant\n", - "13 2011-12 33 LAL 0 1 1 2 0 1 2 27 Kobe Bryant\n", - "14 2012-13 34 LAL 2 4 8 2 2 1 2 9 Kobe Bryant\n", - "15 2013-14 35 LAL Kobe Bryant\n", - "16 2014-15 36 LAL Kobe Bryant\n", - "17 2015-16 37 LAL 1 6 7 1 0 1 1 10 Kobe Bryant\n", - "18 1997-98 19 LAL 2 6 1 2 0 1 1 18 Kobe Bryant\n", - "19 1999-00 21 LAL 1 1 3 2 0 1 3 15 Kobe Bryant\n", - "20 2000-01 22 LAL 2 4 7 1 0 3 3 19 Kobe Bryant\n", - "21 2001-02 23 LAL 2 5 5 1 0 0 2 31 Kobe Bryant\n", - "22 2002-03 24 LAL 2 7 6 3 2 5 5 22 Kobe Bryant\n", - "23 2003-04 25 LAL 1 4 4 5 1 6 3 20 Kobe Bryant\n", - "24 Season Age Team ORB TRB AST STL BLK TOV PF PTS Player\n", - "25 2004-05 20 CLE 1 8 6 2 0 3 0 13 Lebron James\n", - "26 2005-06 21 CLE 2 6 2 2 0 1 2 29 Lebron James\n", - "27 2006-07 22 CLE 0 6 6 1 0 4 0 28 Lebron James\n", - "28 2007-08 23 CLE 1 8 9 2 2 4 3 27 Lebron James\n", - "29 2008-09 24 CLE 0 5 3 0 0 3 0 20 Lebron James\n", - "30 1992-93 29 CHI 3 4 5 4 0 6 5 30 Michael Jordan\n", - "31 1995-96 32 CHI 1 4 1 1 0 0 1 20 Michael Jordan\n", - "32 1996-97 33 CHI 3 11 11 2 0 3 4 14 Michael Jordan\n", - "33 1997-98 34 CHI 1 6 8 3 0 2 0 23 Michael Jordan\n", - "34 2001-02 38 WAS 0 4 3 2 0 1 1 8 Michael Jordan\n", - "35 2002-03 39 WAS 2 5 2 2 0 2 3 20 Michael Jordan\n", - "36 2015-16 31 CLE 0 4 7 0 0 4 0 13 Lebron James\n", - "37 2016-17 32 CLE 0 3 1 0 0 4 2 23 Lebron James\n", - "38 2017-18 33 CLE 0 10 8 1 0 5 2 29 Lebron James\n", - "39 2018-19 34 LAL 2 8 4 0 2 1 1 19 Lebron James\n", - "40 1984-85 21 CHI 3 6 2 3 1 1 4 7 Michael Jordan\n", - "41 1985-86 22 CHI Michael Jordan\n", - "42 1986-87 23 CHI 0 0 4 2 0 5 2 11 Michael Jordan\n", - "43 1987-88 24 CHI 3 8 3 4 4 2 5 40 Michael Jordan\n", - "44 1988-89 25 CHI 1 2 3 5 0 4 1 28 Michael Jordan\n", - "45 1989-90 26 CHI 1 5 2 5 1 5 1 17 Michael Jordan\n", - "46 1990-91 27 CHI 3 5 5 2 0 10 2 26 Michael Jordan\n", - "47 1991-92 28 CHI 1 1 5 2 0 1 2 18 Michael Jordan" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Creating `allstar_games_stats` table\n", - "(\n", - " table_env.connect(FileSystem().path(\"../datasets/nba/allstar_games_stats.csv\"))\n", - " .with_format(\n", - " OldCsv()\n", - " .field(\"Season\", DataTypes.STRING())\n", - " .field(\"Age\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"ORB\", DataTypes.STRING())\n", - " .field(\"TRB\", DataTypes.STRING())\n", - " .field(\"AST\", DataTypes.STRING())\n", - " .field(\"STL\", DataTypes.STRING())\n", - " .field(\"BLK\", DataTypes.STRING())\n", - " .field(\"TOV\", DataTypes.STRING())\n", - " .field(\"PF\", DataTypes.STRING())\n", - " .field(\"PTS\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " )\n", - " .with_schema(\n", - " Schema()\n", - " 
.field(\"Season\", DataTypes.STRING())\n", - " .field(\"Age\", DataTypes.STRING())\n", - " .field(\"Team\", DataTypes.STRING())\n", - " .field(\"ORB\", DataTypes.STRING())\n", - " .field(\"TRB\", DataTypes.STRING())\n", - " .field(\"AST\", DataTypes.STRING())\n", - " .field(\"STL\", DataTypes.STRING())\n", - " .field(\"BLK\", DataTypes.STRING())\n", - " .field(\"TOV\", DataTypes.STRING())\n", - " .field(\"PF\", DataTypes.STRING())\n", - " .field(\"PTS\", DataTypes.STRING())\n", - " .field(\"Player\", DataTypes.STRING())\n", - " )\n", - " .create_temporary_table(\"etl_catalog.nba.allstar_games_stats_temp\")\n", - ")\n", - "\n", - "table_env.execute_sql(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS etl_catalog.nba.allstar_games_stats (Season STRING, Age STRING,\n", - " Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING,\n", - " PF STRING, PTS STRING, Player STRING)\"\"\"\n", - ").wait()\n", - "\n", - "tab = table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\")\n", - "tab.execute_insert(\"etl_catalog.nba.allstar_games_stats\").wait()\n", - "\n", - "# Notice how we view the data on the etl branch via @etl\n", - "table_env.from_path(\"etl_catalog.nba.`allstar_games_stats@etl`\").to_pandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can verify that the new table isn't on the `main` branch but is present on the etl branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# Since we have been working on the `etl` branch, the `allstar_games_stats` table is not on the `main` branch\n", - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.allstar_games_stats\n", - "\tnba.new_totals_stats\n", - "\tnba.salaries\n", - "\n" - ] - } - ], - "source": [ - "# We should see `allstar_games_stats` and the `new_totals_stats` on the `etl` branch\n", - "!nessie content list --ref etl" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we are happy with the data we can again merge it into `main`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie merge etl -b main --force" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Now lets verify that the changes exist on the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.new_totals_stats\n", - "\tnba.allstar_games_stats\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33m* main 
720543fa3a9579d0bfee11e07f383d86468eb4d73dc207e5bd6ef7f76b000930\n", - "\u001b[0m etl c962d80b04ee619a6a0670cb5f664d948c86f6ebf66435027c5abe761e920c9e\n", - " dev f48b93594ddead3a7616a271d657f0fff97cd0c4c04d4a579fa165aa96a69908\n", - "\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:19,196 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,257 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,260 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,263 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,265 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,268 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,270 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n", - "2022-05-24 07:45:19,273 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new decompressor [.gz]\n" - ] - } - ], - "source": [ - "table_count = (\n", - " table_env.from_path(\"main_catalog.nba.allstar_games_stats\").select(\"Season.count\").to_pandas().values[0][0]\n", - ")\n", - "csv_count = (\n", - " table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", - ")\n", - "assert table_count == csv_count" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create `experiment` branch\n", - "--------------------------------\n", - "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", - "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", - "and carry out our experiment, which could consist of the following steps:\n", - "- drop `totals_stats` table\n", - "- add data to `salaries` table\n", - "- compare `experiment` and `main` tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "create_ref_catalog(\"experiment\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Drop the `new_totals_stats` table on the `experiment` branch\n", - "table_env.execute_sql(\"DROP TABLE experiment_catalog.nba.new_totals_stats\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2022-05-24 07:45:20,258 INFO org.apache.hadoop.io.compress.CodecPool [] - Got brand-new compressor [.gz]\n" - ] - } - ], - "source": [ - "# add some salaries for Dirk Nowitzki\n", - "table_env.execute_sql(\n", - " \"\"\"INSERT INTO experiment_catalog.nba.salaries VALUES\n", - " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n", - " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n", - " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n", - " 
('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", - ").wait()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.allstar_games_stats\n", - "\n" - ] - } - ], - "source": [ - "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `new_totals_stats`)\n", - "!nessie content list --ref experiment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.new_totals_stats\n", - "\tnba.allstar_games_stats\n", - "\n" - ] - } - ], - "source": [ - "# `main` hasn't changed been changed and still has the `new_totals_stats` table\n", - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n", - "Notice the use of the `nessie` catalog and the use of `@experiment` to view data on the `experiment` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
EXPR$0
059
\n", - "
" - ], - "text/plain": [ - " EXPR$0\n", - "0 59" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table_env.from_path(\"main_catalog.nba.`salaries@experiment`\").select(lit(1).count).to_pandas()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n", - "to the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
EXPR$0
055
\n", - "
" - ], - "text/plain": [ - " EXPR$0\n", - "0 55" - ] - }, - "execution_count": null, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "table_env.from_path(\"main_catalog.nba.`salaries@main`\").select(lit(1).count).to_pandas()" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Nessie Iceberg/Flink SQL Demo with NBA Dataset\n", + "============================\n", + "This demo showcases how to use Nessie Python API along with Flink from Iceberg\n", + "\n", + "Initialize PyFlink\n", + "----------------------------------------------\n", + "To get started, we will first have to do a few setup steps that give us everything we need\n", + "to get started with Nessie. In case you're interested in the detailed setup steps for Flink, you can check out the [docs](https://projectnessie.org/tools/iceberg/flink/)\n", + "\n", + "The Binder server has downloaded flink and some data for us as well as started a Nessie server in the background. All we have to do is start Flink\n", + "\n", + "The below cell starts a local Flink session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "import os\n", + "from pyflink.datastream import StreamExecutionEnvironment\n", + "from pyflink.table import StreamTableEnvironment\n", + "from pyflink.table.expressions import lit\n", + "from pynessie import init\n", + "\n", + "# where we will store our data\n", + "warehouse = os.path.join(os.getcwd(), \"flink-warehouse\")\n", + "# this was downloaded when Binder started, its available on maven central\n", + "iceberg_flink_runtime_jar = os.path.join(os.getcwd(), \"../iceberg-flink-runtime-1.17-1.4.2.jar\")\n", + "assert os.path.exists(iceberg_flink_runtime_jar)\n", + "\n", + "env = StreamExecutionEnvironment.get_execution_environment()\n", + "env.add_jars(\"file://{}\".format(iceberg_flink_runtime_jar))\n", + "table_env = StreamTableEnvironment.create(env)\n", + "\n", + "nessie_client = init()\n", + "\n", + "\n", + "def create_ref_catalog(ref):\n", + " \"\"\"\n", + " Create a flink catalog that is tied to a specific ref.\n", + "\n", + " In order to create the catalog we have to first create the branch\n", + " \"\"\"\n", + " default_branch = nessie_client.get_default_branch()\n", + " if ref != default_branch:\n", + " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n", + " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n", + " # The important args below are:\n", + " # type - tell Flink to use Iceberg as the catalog\n", + " # catalog-impl - which Iceberg catalog to use, in this case we want Nessie\n", + " # uri - the location of the nessie server.\n", + " # ref - the Nessie ref/branch we want to use (defaults to main)\n", + " # warehouse - the location this catalog should store its data\n", + " table_env.execute_sql(\n", + " f\"\"\"CREATE CATALOG {ref}_catalog WITH (\n", + " 'type'='iceberg',\n", + " 'catalog-impl'='org.apache.iceberg.nessie.NessieCatalog',\n", + " 'uri'='http://localhost:19120/api/v1',\n", + " 'ref'='{ref}',\n", + " 'warehouse' = '{warehouse}')\"\"\"\n", + " )\n", + "\n", + "\n", + "create_ref_catalog(nessie_client.get_default_branch())\n", + "print(\"\\n\\n\\nFlink running\\n\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Solving Data 
Engineering problems with Nessie\n", + "============================\n", + "\n", + "In this demo, we are a data engineer working at a fictional sports analytics blog. In order to write articles, the authors need access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n", + "\n", + "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing it to the analysts." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up Nessie branches (via Nessie CLI)\n", + "----------------------------\n", + "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", + "\n", + "- Create a new branch named `dev`\n", + "- List all branches\n", + "\n", + "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "create_ref_catalog(\"dev\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We have created the branch `dev` and we can see the branch with the Nessie `hash` it's currently pointing to.\n", + "\n", + "Below we list all branches. Note that the auto-created `main` branch already exists and both branches point at the same empty `hash` initially." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create tables under dev branch\n", + "-------------------------------------\n", + "Once we have created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", + "\n", + "We create two tables under the `dev` branch:\n", + "- `salaries`\n", + "- `totals_stats`\n", + "\n", + "These tables list the salaries per player per year and their stats per year.\n", + "\n", + "To create the data we:\n", + "\n", + "1. switch our branch context to `dev`\n", + "2. create the table\n", + "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. 
A production use case would likely take feeds from official data sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the dataset\n", + "from pyflink.table import DataTypes, Schema, TableDescriptor\n", + "\n", + "table_env.execute_sql(\"CREATE DATABASE dev_catalog.nba\").wait()\n", + "\n", + "# Creating `salaries` table\n", + "(\n", + " table_env.create_temporary_table(\n", + " \"dev_catalog.nba.salaries_temp\",\n", + " TableDescriptor.for_connector(\"filesystem\")\n", + " .schema(Schema.new_builder()\n", + " .column(\"Season\", DataTypes.STRING())\n", + " .column(\"Team\", DataTypes.STRING())\n", + " .column(\"Salary\", DataTypes.STRING())\n", + " .column(\"Player\", DataTypes.STRING())\n", + " .build())\n", + " .option(\"path\", \"../datasets/nba/salaries.csv\")\n", + " .format(\"csv\")\n", + " .build())\n", + ")\n", + "\n", + "table_env.execute_sql(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.salaries\n", + " (Season STRING, Team STRING, Salary STRING, Player STRING)\"\"\"\n", + ").wait()\n", + "\n", + "tab = table_env.from_path(\"dev_catalog.nba.salaries_temp\")\n", + "tab.execute_insert(\"dev_catalog.nba.salaries\").wait()\n", + "\n", + "# Creating `totals_stats` table\n", + "(\n", + " table_env.create_temporary_table(\n", + " \"dev_catalog.nba.totals_stats_temp\",\n", + " TableDescriptor.for_connector(\"filesystem\")\n", + " .schema(Schema.new_builder()\n", + " .column(\"Season\", DataTypes.STRING())\n", + " .column(\"Age\", DataTypes.STRING())\n", + " .column(\"Team\", DataTypes.STRING())\n", + " .column(\"ORB\", DataTypes.STRING())\n", + " .column(\"DRB\", DataTypes.STRING())\n", + " .column(\"TRB\", DataTypes.STRING())\n", + " .column(\"AST\", DataTypes.STRING())\n", + " .column(\"STL\", DataTypes.STRING())\n", + " .column(\"BLK\", DataTypes.STRING())\n", + " .column(\"TOV\", DataTypes.STRING())\n", + " .column(\"PTS\", DataTypes.STRING())\n", + " .column(\"Player\", DataTypes.STRING())\n", + " .column(\"RSorPO\", DataTypes.STRING())\n", + " .build())\n", + " .option(\"path\", \"../datasets/nba/totals_stats.csv\")\n", + " .format(\"csv\")\n", + " .build())\n", + ")\n", + "\n", + "table_env.execute_sql(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS dev_catalog.nba.totals_stats (Season STRING, Age STRING, Team STRING,\n", + " ORB STRING, DRB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING, PTS STRING,\n", + " Player STRING, RSorPO STRING)\"\"\"\n", + ").wait()\n", + "\n", + "tab = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\")\n", + "tab.execute_insert(\"dev_catalog.nba.totals_stats\").wait()\n", + "\n", + "salaries = table_env.from_path(\"main_catalog.nba.`salaries@dev`\").select(lit(1).count).to_pandas().values[0][0]\n", + "totals_stats = table_env.from_path(\"main_catalog.nba.`totals_stats@dev`\").select(lit(1).count).to_pandas().values[0][0]\n", + "print(f\"\\n\\n\\nAdded {salaries} rows to the salaries table and {totals_stats} rows to the totals_stats table.\\n\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Now we count the rows in our tables to ensure they are the same number as the csv files. Note we use the `table@branch` notation which overrides the context set by the catalog." 
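> **Editorial sketch (not part of the PR):** the temp-table / `CREATE TABLE` / insert sequence in the cell above is repeated almost verbatim for `totals_stats`, and again later for `allstar_games_stats`. A small helper could fold the pattern into one call. The name `load_csv_into_iceberg` and its signature are ours; it assumes the all-STRING schemas used throughout the demo and the `table_env`, `Schema`, `TableDescriptor`, and `DataTypes` names imported in the cell above.

```python
def load_csv_into_iceberg(catalog, db, name, columns, csv_path):
    """Register a filesystem CSV source and copy it into an Iceberg table.

    Hypothetical helper: every column is typed STRING, matching the demo's
    tables, and `table_env` is the StreamTableEnvironment created earlier.
    """
    # 1. temporary table backed by the CSV file
    builder = Schema.new_builder()
    for column_name in columns:
        builder.column(column_name, DataTypes.STRING())
    table_env.create_temporary_table(
        f"{catalog}.{db}.{name}_temp",
        TableDescriptor.for_connector("filesystem")
        .schema(builder.build())
        .option("path", csv_path)
        .format("csv")
        .build(),
    )
    # 2. target Iceberg table with the same all-STRING schema
    cols_ddl = ", ".join(f"{column_name} STRING" for column_name in columns)
    table_env.execute_sql(
        f"CREATE TABLE IF NOT EXISTS {catalog}.{db}.{name} ({cols_ddl})"
    ).wait()
    # 3. copy the rows across
    table_env.from_path(f"{catalog}.{db}.{name}_temp").execute_insert(
        f"{catalog}.{db}.{name}"
    ).wait()
```

With such a helper, the salaries load above would reduce to `load_csv_into_iceberg("dev_catalog", "nba", "salaries", ["Season", "Team", "Salary", "Player"], "../datasets/nba/salaries.csv")`.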
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "table_count = table_env.from_path(\"dev_catalog.nba.`salaries@dev`\").select(lit(1).count).to_pandas().values[0][0]\n", + "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(lit(1).count).to_pandas().values[0][0]\n", + "assert table_count == csv_count\n", + "print(table_count)\n", + "\n", + "table_count = table_env.from_path(\"dev_catalog.nba.`totals_stats@dev`\").select(lit(1).count).to_pandas().values[0][0]\n", + "csv_count = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(lit(1).count).to_pandas().values[0][0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check generated tables\n", + "----------------------------\n", + "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", + "let's verify that the `main` branch was not altered by our changes." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "And on the `dev` branch we expect to see two tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie content list --ref dev" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can also verify that the `dev` and `main` branches point to different commits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dev promotion into main\n", + "-----------------------\n", + "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", + "We merge `dev` into `main` via the command line `merge` command.\n", + "Both branches should be at the same revision after merging/promotion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie merge dev -b main --force" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can verify that the `main` branch now contains the expected tables and row counts.\n", + "\n", + "The tables are now on `main` and ready for consumption by our blog authors and analysts!" 
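> **Editorial note:** one thing worth making explicit before the verification cells below: after the merge, the catalog pinned to `main` resolves the tables directly, while the explicit `@dev` suffix still works and now points at the same data. A quick sanity check along these lines (ours, not in the PR; it reuses `table_env` and `lit` from the setup cell) would demonstrate it:

```python
# After the merge, `main` and `dev` hold the same salaries rows, so the
# catalog-default read and the explicit branch-suffix read must agree.
merged_rows = table_env.from_path("main_catalog.nba.salaries").select(lit(1).count).to_pandas().values[0][0]
dev_rows = table_env.from_path("main_catalog.nba.`salaries@dev`").select(lit(1).count).to_pandas().values[0][0]
assert merged_rows == dev_rows
```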
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "table_count = table_env.from_path(\"main_catalog.nba.salaries\").select(lit(1).count).to_pandas().values[0][0]\n", + "csv_count = table_env.from_path(\"dev_catalog.nba.salaries_temp\").select(lit(1).count).to_pandas().values[0][0]\n", + "assert table_count == csv_count\n", + "\n", + "table_count = table_env.from_path(\"main_catalog.nba.totals_stats\").select(lit(1).count).to_pandas().values[0][0]\n", + "csv_count = table_env.from_path(\"dev_catalog.nba.totals_stats_temp\").select(lit(1).count).to_pandas().values[0][0]\n", + "assert table_count == csv_count" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Perform regular ETL on the new tables\n", + "-------------------\n", + "Our analysts are happy with the data and we now want to regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", + "\n", + "1. Update the `salaries` table to add new data\n", + "2. We have decided the `Age` column isn't required in the `totals_stats` table, so we will drop the column\n", + "3. We create a new table to hold information about the players' appearances in All-Star games\n", + "\n", + "As always, we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." 
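> **Editorial note for readers skimming the diff:** the `create_ref_catalog("etl")` call in the next cell first creates the `etl` branch through pynessie and then registers a Flink catalog pinned to it. Expanded by hand from the helper defined at the top of the notebook (the `warehouse` value is the local path computed there), the statement it issues is roughly:

```python
# Equivalent SQL issued by create_ref_catalog("etl") once the branch exists;
# `table_env` and `warehouse` are the names defined in the setup cell.
table_env.execute_sql(
    f"""CREATE CATALOG etl_catalog WITH (
        'type'='iceberg',
        'catalog-impl'='org.apache.iceberg.nessie.NessieCatalog',
        'uri'='http://localhost:19120/api/v1',
        'ref'='etl',
        'warehouse' = '{warehouse}')"""
)
```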
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "create_ref_catalog(\"etl\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# add some salaries for Kevin Durant\n", + "table_env.execute_sql(\n", + " \"\"\"INSERT INTO etl_catalog.nba.salaries\n", + " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n", + " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n", + " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n", + " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n", + ").wait()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Rename the table `totals_stats` to `new_totals_stats`\n", + "table_env.execute_sql(\"ALTER TABLE etl_catalog.nba.totals_stats RENAME TO etl_catalog.nba.new_totals_stats\").wait()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating `allstar_games_stats` table\n", + "(\n", + " table_env.create_temporary_table(\n", + " \"etl_catalog.nba.allstar_games_stats_temp\",\n", + " TableDescriptor.for_connector(\"filesystem\")\n", + " .schema(Schema.new_builder()\n", + " .column(\"Season\", DataTypes.STRING())\n", + " .column(\"Age\", DataTypes.STRING())\n", + " .column(\"Team\", DataTypes.STRING())\n", + " .column(\"ORB\", DataTypes.STRING())\n", + " .column(\"TRB\", DataTypes.STRING())\n", + " .column(\"AST\", DataTypes.STRING())\n", + " .column(\"STL\", DataTypes.STRING())\n", + " .column(\"BLK\", DataTypes.STRING())\n", + " .column(\"TOV\", DataTypes.STRING())\n", + " .column(\"PF\", DataTypes.STRING())\n", + " .column(\"PTS\", DataTypes.STRING())\n", + " .column(\"Player\", DataTypes.STRING())\n", + " .build())\n", + " .option(\"path\", \"../datasets/nba/allstar_games_stats.csv\")\n", + " .format(\"csv\")\n", + " .build())\n", + ")\n", + "\n", + "table_env.execute_sql(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS etl_catalog.nba.allstar_games_stats (Season STRING, Age STRING,\n", + " Team STRING, ORB STRING, TRB STRING, AST STRING, STL STRING, BLK STRING, TOV STRING,\n", + " PF STRING, PTS STRING, Player STRING)\"\"\"\n", + ").wait()\n", + "\n", + "tab = table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\")\n", + "tab.execute_insert(\"etl_catalog.nba.allstar_games_stats\").wait()\n", + "\n", + "# Notice how we view the data on the etl branch via @etl\n", + "table_env.from_path(\"etl_catalog.nba.`allstar_games_stats@etl`\").to_pandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can verify that the new table isn't on the `main` branch but is present on the `etl` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Since we have been working on the `etl` branch, the `allstar_games_stats` table is not on the `main` branch\n", + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should see the `allstar_games_stats` and `new_totals_stats` tables on the `etl` branch\n", + "!nessie content list --ref etl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we are happy with the data, we can again merge it into `main`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + 
"source": [ + "!nessie merge etl -b main --force" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Now lets verify that the changes exist on the `main` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "table_count = (\n", + " table_env.from_path(\"main_catalog.nba.allstar_games_stats\").select(\"Season.count\").to_pandas().values[0][0]\n", + ")\n", + "csv_count = (\n", + " table_env.from_path(\"etl_catalog.nba.allstar_games_stats_temp\").select(\"Season.count\").to_pandas().values[0][0]\n", + ")\n", + "assert table_count == csv_count" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create `experiment` branch\n", + "--------------------------------\n", + "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", + "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", + "and carry out our experiment, which could consist of the following steps:\n", + "- drop `totals_stats` table\n", + "- add data to `salaries` table\n", + "- compare `experiment` and `main` tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "create_ref_catalog(\"experiment\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the `new_totals_stats` table on the `experiment` branch\n", + "table_env.execute_sql(\"DROP TABLE experiment_catalog.nba.new_totals_stats\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add some salaries for Dirk Nowitzki\n", + "table_env.execute_sql(\n", + " \"\"\"INSERT INTO experiment_catalog.nba.salaries VALUES\n", + " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n", + " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n", + " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n", + " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", + ").wait()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `new_totals_stats`)\n", + "!nessie content list --ref experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# `main` hasn't changed been changed and still has the `new_totals_stats` table\n", + "!nessie content list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the contents of the `salaries` table on the `experiment` branch.\n", + "Notice the use of the `nessie` catalog and the use of `@experiment` to view data on the `experiment` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"table_env.from_path(\"main_catalog.nba.`salaries@experiment`\").select(lit(1).count).to_pandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and compare to the contents of the `salaries` table on the `main` branch. Notice that we didn't have to specify `@branchName` as it defaulted\n", + "to the `main` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" } + }, + "outputs": [], + "source": [ + "table_env.from_path(\"main_catalog.nba.`salaries@main`\").select(lit(1).count).to_pandas()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "flink-demo", + "language": "python", + "name": "flink-demo" }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/nessie-iceberg-hive-demo-nba.ipynb b/notebooks/nessie-iceberg-hive-demo-nba.ipynb index 8dd639f8..f10cfb5d 100644 --- a/notebooks/nessie-iceberg-hive-demo-nba.ipynb +++ b/notebooks/nessie-iceberg-hive-demo-nba.ipynb @@ -1,1084 +1,817 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "Nessie Iceberg/Hive SQL Demo with NBA Dataset\n", - "============================\n", - "This demo showcases how to use Nessie Python API along with Hive from Iceberg\n", - "\n", - "Initialize PyHive\n", - "----------------------------------------------\n", - "To get started, we will first have to do a few setup steps that give us everything we need\n", - "to get started with Nessie. In case you're interested in the detailed setup steps for Hive, you can check out the [docs](https://projectnessie.org/tools/iceberg/hive/)\n", - "\n", - "The Binder server has downloaded Hive, Hadoop and some data for us as well as started a Nessie server in the background. All we have to do is to connect to Hive session.\n", - "\n", - "The below cell starts a local Hive session with parameters needed to configure Nessie. Each config option is followed by a comment explaining its purpose." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "Hive running\n", - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "import os\n", - "from pyhive import hive\n", - "from pynessie import init\n", - "\n", - "# where we will store our data\n", - "warehouse = \"file://\" + os.path.join(os.getcwd(), \"nessie_warehouse\")\n", - "\n", - "# where our datasets are located\n", - "datasets_path = \"file://\" + os.path.join(os.path.dirname(os.getcwd()), \"datasets\")\n", - "\n", - "nessie_client = init()\n", - "\n", - "\n", - "def create_ref_catalog(ref):\n", - " \"\"\"\n", - " Create a branch and switch the current ref to the created branch\n", - " \"\"\"\n", - " default_branch = nessie_client.get_default_branch()\n", - " if ref != default_branch:\n", - " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n", - " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n", - " return switch_ref_catalog(ref)\n", - "\n", - "\n", - "def switch_ref_catalog(ref):\n", - " \"\"\"\n", - " Switch a branch. When we switch the branch via Hive, we will need to reconnect to Hive\n", - " \"\"\"\n", - " # The important args below are:\n", - " # catalog-impl: which Iceberg catalog to use, in this case we want NessieCatalog\n", - " # uri: the location of the nessie server.\n", - " # ref: the Nessie ref/branch we want to use (defaults to main)\n", - " # warehouse: the location this catalog should store its data\n", - " return hive.connect(\n", - " \"localhost\",\n", - " configuration={\n", - " \"iceberg.catalog.dev_catalog.catalog-impl\": \"org.apache.iceberg.nessie.NessieCatalog\",\n", - " \"iceberg.catalog.dev_catalog.uri\": \"http://localhost:19120/api/v1\",\n", - " \"iceberg.catalog.dev_catalog.ref\": ref,\n", - " \"iceberg.catalog.dev_catalog.warehouse\": warehouse,\n", - " },\n", - " ).cursor()\n", - "\n", - "\n", - "print(\"\\n\\nHive running\\n\\n\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Solving Data Engineering problems with Nessie\n", - "============================\n", - "\n", - "In this Demo we are a data engineer working at a fictional sports analytics blog. In order for the authors to write articles they have to have access to the relevant data. They need to be able to retrieve data quickly and be able to create charts with it.\n", - "\n", - "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing to the analysts." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Set up Nessie branches (via Nessie CLI)\n", - "----------------------------\n", - "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", - "\n", - "- Create a new branch named `dev`\n", - "- List all branches\n", - "\n", - "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "current_ref = create_ref_catalog(\"dev\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We have created the branch `dev` and we can see the branch with the Nessie `hash` its currently pointing to.\n", - "\n", - "Below we list all branches. Note that the auto created `main` branch already exists and both branches point at the same empty `hash` initially" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dev 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[0m\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create tables under dev branch\n", - "-------------------------------------\n", - "Once we created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", - "\n", - "We create two tables under the `dev` branch:\n", - "- `salaries`\n", - "- `totals_stats`\n", - "\n", - "These tables list the salaries per player per year and their stats per year.\n", - "\n", - "To create the data we:\n", - "\n", - "1. switch our branch context to dev\n", - "2. create the table\n", - "3. insert the data from an existing csv file. This csv file is already stored locally on the demo machine. A production use case would likely take feeds from official data sources" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Created schema nba\n", - "\n", - "\n", - "Creating tables nba.salaries and nba.totals_stats....\n", - "\n", - "\n", - "Created and inserted data into table nba.salaries from dataset salaries\n", - "\n", - "\n", - "Created and inserted data into table nba.totals_stats from dataset totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# Creating our demo schema\n", - "current_ref.execute(\"CREATE SCHEMA IF NOT EXISTS nba\")\n", - "\n", - "print(\"\\nCreated schema nba\\n\")\n", - "\n", - "\n", - "print(\"\\nCreating tables nba.salaries and nba.totals_stats....\\n\")\n", - "\n", - "# Creating `salaries` table\n", - "\n", - "current_ref.execute(\n", - " f\"\"\"CREATE TABLE IF NOT EXISTS nba.salaries (Season STRING,\n", - " Team STRING, Salary STRING, Player STRING)\n", - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", - " LOCATION '{warehouse}/nba/salaries'\n", - " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", - " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", - ")\n", - "\n", - "## We create a temporary table to load data into our target table since\n", - "## is not possible to load data directly from CSV into non-native table.\n", - "current_ref.execute(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS nba.salaries_temp (Season STRING,\n", - " Team STRING, Salary STRING, Player STRING)\n", - " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", - ")\n", - "\n", - "current_ref.execute(f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/salaries.csv\" OVERWRITE INTO TABLE nba.salaries_temp')\n", - "current_ref.execute(\"INSERT OVERWRITE TABLE 
nba.salaries SELECT * FROM nba.salaries_temp\")\n", - "\n", - "print(\"\\nCreated and inserted data into table nba.salaries from dataset salaries\\n\")\n", - "\n", - "\n", - "# Creating `totals_stats` table\n", - "\n", - "current_ref.execute(\n", - " f\"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING,\n", - " DRB STRING, TRB STRING, AST STRING, STL STRING,\n", - " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", - " LOCATION '{warehouse}/nba/totals_stats'\n", - " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", - " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", - ")\n", - "\n", - "## We create a temporary table to load data into our target table since\n", - "## is not possible to load data directly from CSV into non-native table.\n", - "current_ref.execute(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats_temp (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING,\n", - " DRB STRING, TRB STRING, AST STRING, STL STRING,\n", - " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", - " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", - ")\n", - "\n", - "current_ref.execute(\n", - " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/totals_stats.csv\" OVERWRITE INTO TABLE nba.totals_stats_temp'\n", - ")\n", - "current_ref.execute(\"INSERT OVERWRITE TABLE nba.totals_stats SELECT * FROM nba.totals_stats_temp\")\n", - "\n", - "print(\"\\nCreated and inserted data into table nba.totals_stats from dataset totals_stats\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Now we count the rows in our tables to ensure they are the same number as the csv files. Unlike Spark and Flink demos, we can't use the notation of `table@branch` (see the github issue [here](https://github.com/projectnessie/nessie/issues/1985). Therefore, we just set Nessie ref settings through Hive setting `SET iceberg.catalog.{catalog}.ref = {branch}` whenever we want to work on a specific branch." 
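> **Editorial sketch (not part of the PR, and unverified against the demo's Hive setup):** the markdown above names the Hive-side mechanism for switching branches without reconnecting. From the open PyHive cursor it would look roughly like this, assuming the `dev_catalog` properties configured in the connection cell earlier:

```python
# Repoint the session's Nessie catalog at the `main` branch; subsequent
# queries against dev_catalog-managed tables would then read from `main`.
current_ref.execute("SET iceberg.catalog.dev_catalog.ref=main")
```

The notebook itself instead reconnects via `switch_ref_catalog`, which avoids relying on session-level `SET` behavior.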
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "tags": [] - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.salaries\n", - "\n", - "51\n", - "\n", - "Counting rows in nba.totals_stats\n", - "\n", - "93\n" - ] - } - ], - "source": [ - "# We make sure we are still in dev branch\n", - "current_ref = switch_ref_catalog(\"dev\")\n", - "\n", - "print(\"\\nCounting rows in nba.salaries\\n\")\n", - "\n", - "# We count now\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "print(\"\\nCounting rows in nba.totals_stats\\n\")\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check generated tables\n", - "----------------------------\n", - "Since we have been working solely on the `dev` branch, where we created 2 tables and added some data,\n", - "let's verify that the `main` branch was not altered by our changes." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "And on the `dev` branch we expect to see two tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.totals_stats\n", - "\tnba.salaries\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list --ref dev" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can also verify that the `dev` and `main` branches point to different commits" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " dev d1ea40ccb14fd8365828bf740d73e8ed9d04ce5d9739020d00d7ffa5937cf9d3\n", - "\u001b[33m* main 2e1cfa82b035c26cbbbdae632cea070514eb8b773f616aaeaf668e2f0be8f10d\n", - "\u001b[0m\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dev promotion into main\n", - "-----------------------\n", - "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", - "We merge `dev` into `main` via the command line `merge` command.\n", - "Both branches should be at the same revision after merging/promotion." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie merge dev -b main --force" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "We can verify that the `main` branch now contains the expected tables and row counts.\n", - "\n", - "The tables are now on `main` and ready for consumption by our blog authors and analysts!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33m* main 330f993ac08aceb2252702611f6bf1a92f49ac2e3fc709b250a017ba4a9cded6\n", - "\u001b[0m dev d1ea40ccb14fd8365828bf740d73e8ed9d04ce5d9739020d00d7ffa5937cf9d3\n", - "\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.salaries\n", - "\n", - "51\n", - "\n", - "Counting rows in nba.totals_stats\n", - "\n", - "93\n" - ] - } - ], - "source": [ - "# We switch to main branch\n", - "current_ref = switch_ref_catalog(\"main\")\n", - "\n", - "print(\"\\nCounting rows in nba.salaries\\n\")\n", - "\n", - "# We count now\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)\n", - "\n", - "print(\"\\nCounting rows in nba.totals_stats\\n\")\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Perform regular ETL on the new tables\n", - "-------------------\n", - "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", - "\n", - "1. Update the salaries table to add new data\n", - "2. We have decided the `Age` column isn't required in the `totals_stats` table so we will drop the column\n", - "3. We create a new table to hold information about the players appearances in all star games\n", - "\n", - "As always we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "current_ref = create_ref_catalog(\"etl\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [], - "source": [ - "# add some salaries for Kevin Durant\n", - "current_ref.execute(\n", - " \"\"\"INSERT INTO nba.salaries\n", - " VALUES ('2017-18', 'Golden State Warriors', '$25000000', 'Kevin Durant'),\n", - " ('2018-19', 'Golden State Warriors', '$30000000', 'Kevin Durant'),\n", - " ('2019-20', 'Brooklyn Nets', '$37199000', 'Kevin Durant'),\n", - " ('2020-21', 'Brooklyn Nets', '$39058950', 'Kevin Durant')\"\"\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Creating table nba.allstar_games_stats\n", - "\n", - "\n", - "Created and inserted data into table nba.allstar_table_temp from dataset allstar_games_stats\n", - "\n", - "\n", - "Counting rows in nba.allstar_games_stats\n", - "\n", - "48\n" - ] - } - ], - "source": [ - "print(\"\\nCreating table nba.allstar_games_stats\\n\")\n", - "\n", - "# Creating `allstar_games_stats` table\n", - "current_ref.execute(\n", - " f\"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_games_stats (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING,\n", - " TRB STRING, AST STRING, STL STRING, BLK STRING,\n", - " TOV STRING, PF STRING, PTS STRING, Player STRING)\n", - " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", - " LOCATION '{warehouse}/nba/allstar_games_stats'\n", - " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", - " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", - ")\n", - "\n", - "## We create a temporary table to load data into our target table since\n", - "## is not possible to load data directly from CSV into non-native table.\n", - "current_ref.execute(\n", - " \"\"\"CREATE TABLE IF NOT EXISTS nba.allstar_table_temp (\n", - " Season STRING, Age STRING, Team STRING, ORB STRING, TRB STRING,\n", - " AST STRING, STL STRING, BLK STRING,\n", - " TOV STRING, PF STRING, PTS STRING, Player STRING)\n", - " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", - ")\n", - "\n", - "current_ref.execute(\n", - " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/allstar_games_stats.csv\" OVERWRITE INTO TABLE nba.allstar_table_temp'\n", - ")\n", - "current_ref.execute(\"INSERT OVERWRITE TABLE nba.allstar_games_stats SELECT * FROM nba.allstar_table_temp\")\n", - "\n", - "print(\"\\nCreated and inserted data into table nba.allstar_table_temp from dataset allstar_games_stats\\n\")\n", - "\n", - "\n", - "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n", - "\n", - "# Since we can't do 'table@branch'\n", - "current_ref = switch_ref_catalog(\"etl\")\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n", - "print(current_ref.fetchone()[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can verify that the new table isn't on the `main` branch but is present on the etl branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# Since we have been working on the `etl` branch, the `allstar_games_stats` 
table is not on the `main` branch\n", - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.allstar_games_stats\n", - "\tnba.salaries\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# We should see the new `allstar_games_stats` table on the `etl` branch\n", - "!nessie content list --ref etl" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we are happy with the data we can again merge it into `main`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - } - ], - "source": [ - "!nessie merge etl -b main --force" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%% md\n" - } - }, - "source": [ - "Now lets verify that the changes exist on the `main` branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.allstar_games_stats\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "!nessie content list" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\u001b[33m* main 11ed5b46713231a5fb85f31083d47dbf6bfa1df5839bebbac08301cda8afe22f\n", - "\u001b[0m etl a3e06ba7595dfdb8bc67b0d6825587d2858cfe2b013bf1b95c5a1471578c4af3\n", - " dev d1ea40ccb14fd8365828bf740d73e8ed9d04ce5d9739020d00d7ffa5937cf9d3\n", - "\n" - ] - } - ], - "source": [ - "!nessie --verbose branch" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.allstar_games_stats\n", - "\n", - "48\n" - ] - } - ], - "source": [ - "# We switch to the main branch\n", - "current_ref = switch_ref_catalog(\"main\")\n", - "\n", - "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n", - "\n", - "# We count now\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n", - "table_count = current_ref.fetchone()[0]\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_table_temp\")\n", - "csv_count = current_ref.fetchone()[0]\n", - "assert table_count == csv_count\n", - "print(table_count)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create `experiment` branch\n", - "--------------------------------\n", - "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", - "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", - "and carry out our experiment, which could consist of the following steps:\n", - "- drop `totals_stats` table\n", - "- add data to `salaries` table\n", - "- compare `experiment` and `main` tables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "current_ref = 
create_ref_catalog(\"experiment\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Drop the `totals_stats` table on the `experiment` branch\n", - "current_ref.execute(\"DROP TABLE nba.totals_stats\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# add some salaries for Dirk Nowitzki\n", - "current_ref.execute(\n", - " \"\"\"INSERT INTO nba.salaries VALUES\n", - " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n", - " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n", - " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n", - " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.allstar_games_stats\n", - "\n" - ] - } - ], - "source": [ - "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `totals_stats`)\n", - "!nessie content list --ref experiment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "jupyter": { - "outputs_hidden": false - }, - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ICEBERG_TABLE:\n", - "\tnba.salaries\n", - "\tnba.allstar_games_stats\n", - "\tnba.totals_stats\n", - "\n" - ] - } - ], - "source": [ - "# `main` hasn't been changed and still has the `totals_stats` table\n", - "!nessie content list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Let's take a look at the contents of the `salaries` table on the `experiment` branch." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.salaries\n", - "\n", - "59\n" - ] - } - ], - "source": [ - "current_ref = switch_ref_catalog(\"experiment\")\n", - "\n", - "print(\"\\nCounting rows in nba.salaries\\n\")\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", - "print(current_ref.fetchone()[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "and compare to the contents of the `salaries` table on the `main` branch." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "pycharm": { - "name": "#%%\n" - } - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Counting rows in nba.salaries\n", - "\n", - "56\n" - ] - } - ], - "source": [ - "current_ref = switch_ref_catalog(\"main\")\n", - "\n", - "# the following INSERT is a workaround for https://github.com/apache/iceberg/pull/4509 until iceberg 0.13.2 is released\n", - "# add a single salary for Dirk Nowitzki (so we expect 3 less total rows)\n", - "current_ref.execute(\n", - " \"\"\"INSERT INTO nba.salaries VALUES\n", - " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", - ")\n", - "\n", - "print(\"\\nCounting rows in nba.salaries\\n\")\n", - "\n", - "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", - "print(current_ref.fetchone()[0])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And finally lets clean up after ourselves" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "\n" - ] - } - ], - "source": [ - "!nessie branch --delete dev\n", - "!nessie branch --delete etl\n", - "!nessie branch --delete experiment" - ] + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "Nessie Iceberg/Hive SQL Demo with NBA Dataset\n", + "============================\n", + "This demo showcases how to use the Nessie Python API together with Iceberg's Hive integration.\n", + "\n", + "Initialize PyHive\n", + "----------------------------------------------\n", + "To get started, we will first have to do a few setup steps that give us everything we need\n", + "to work with Nessie. In case you're interested in the detailed setup steps for Hive, you can check out the [docs](https://projectnessie.org/tools/iceberg/hive/).\n", + "\n", + "The Binder server has downloaded Hive, Hadoop and some data for us, as well as started a Nessie server in the background. All we have to do is connect to a Hive session.\n", + "\n", + "The cell below starts a local Hive session with the parameters needed to configure Nessie. Each config option is explained by an accompanying comment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from pyhive import hive\n", + "from pynessie import init\n", + "\n", + "# where we will store our data\n", + "warehouse = \"file://\" + os.path.join(os.getcwd(), \"nessie_warehouse\")\n", + "\n", + "# where our datasets are located\n", + "datasets_path = \"file://\" + os.path.join(os.path.dirname(os.getcwd()), \"datasets\")\n", + "\n", + "nessie_client = init()\n", + "\n", + "\n", + "def create_ref_catalog(ref):\n", + " \"\"\"\n", + " Create a branch and switch the current ref to the created branch\n", + " \"\"\"\n", + " default_branch = nessie_client.get_default_branch()\n", + " if ref != default_branch:\n", + " default_branch_hash = nessie_client.get_reference(default_branch).hash_\n", + " nessie_client.create_branch(ref, ref=default_branch, hash_on_ref=default_branch_hash)\n", + " return switch_ref_catalog(ref)\n", + "\n", + "\n", + "def switch_ref_catalog(ref):\n", + " \"\"\"\n", + " Switch to a branch. When we switch branches via Hive, we need to reconnect to Hive.\n", + " \"\"\"\n", + " # The important args below are:\n", + " # catalog-impl: which Iceberg catalog to use, in this case we want NessieCatalog\n", + " # uri: the location of the Nessie server.\n", + " # ref: the Nessie ref/branch we want to use (defaults to main)\n", + " # warehouse: the location this catalog should store its data\n", + " return hive.connect(\n", + " \"localhost\",\n", + " configuration={\n", + " \"iceberg.catalog.dev_catalog.catalog-impl\": \"org.apache.iceberg.nessie.NessieCatalog\",\n", + " \"iceberg.catalog.dev_catalog.uri\": \"http://localhost:19120/api/v1\",\n", + " \"iceberg.catalog.dev_catalog.ref\": ref,\n", + " \"iceberg.catalog.dev_catalog.warehouse\": warehouse,\n", + " },\n", + " ).cursor()\n", + "\n", + "\n", + "print(\"\\n\\nHive running\\n\\n\\n\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Solving Data Engineering problems with Nessie\n", + "============================\n", + "\n", + "In this demo we are a data engineer working at a fictional sports analytics blog. In order to write articles, the authors need access to the relevant data. They need to be able to retrieve data quickly and to create charts with it.\n", + "\n", + "We have been asked to collect and expose some information about basketball players. We have located some data sources and are now ready to start ingesting data into our data lakehouse. We will perform the ingestion steps on a Nessie branch to test and validate the data before exposing it to the analysts." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set up Nessie branches (via Nessie CLI)\n", + "----------------------------\n", + "Once all dependencies are configured, we can get started with ingesting our basketball data into `Nessie` with the following steps:\n", + "\n", + "- Create a new branch named `dev`\n", + "- List all branches\n", + "\n", + "It is worth mentioning that we don't have to explicitly create a `main` branch, since it's the default branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_ref = create_ref_catalog(\"dev\")" + ] + },
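An aside for readers of this patch: `create_ref_catalog("dev")` above wraps a handful of pynessie calls. A minimal sketch of the same flow, using only the client methods that already appear in the setup cell (`get_default_branch`, `get_reference`, `create_branch`); the branch name `scratch` is purely illustrative:

```python
# Sketch only: create a branch off the default branch with the pynessie client
# from the setup cell, then confirm it points at the same commit hash as its base.
base = nessie_client.get_default_branch()            # usually "main"
base_hash = nessie_client.get_reference(base).hash_  # hash the base branch points at
nessie_client.create_branch("scratch", ref=base, hash_on_ref=base_hash)

# A fresh branch points at exactly the same hash as its base until a commit lands on it.
assert nessie_client.get_reference("scratch").hash_ == base_hash
```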
+ { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" } - }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + }, + "source": [ + "We have created the branch `dev` and we can see the branch with the Nessie `hash` it's currently pointing to.\n", + "\n", + "Below we list all branches. Note that the auto-created `main` branch already exists and both branches initially point at the same empty `hash`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create tables under dev branch\n", + "-------------------------------------\n", + "Once we have created the `dev` branch and verified that it exists, we can create some tables and add some data.\n", + "\n", + "We create two tables under the `dev` branch:\n", + "- `salaries`\n", + "- `totals_stats`\n", + "\n", + "These tables list the salaries per player per year and their stats per year.\n", + "\n", + "To create the data, we:\n", + "\n", + "1. switch our branch context to `dev`\n", + "2. create the table\n", + "3. insert the data from an existing CSV file. This CSV file is already stored locally on the demo machine; a production use case would likely take feeds from official data sources" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating our demo schema\n", + "current_ref.execute(\"CREATE SCHEMA IF NOT EXISTS nba\")\n", + "\n", + "print(\"\\nCreated schema nba\\n\")\n", + "\n", + "\n", + "print(\"\\nCreating tables nba.salaries and nba.totals_stats....\\n\")\n", + "\n", + "# Creating `salaries` table\n", + "\n", + "current_ref.execute(\n", + " f\"\"\"CREATE TABLE IF NOT EXISTS nba.salaries (Season STRING,\n", + " Team STRING, Salary STRING, Player STRING)\n", + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", + " LOCATION '{warehouse}/nba/salaries'\n", + " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", + " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", + ")\n", + "\n", + "## We create a temporary table to load data into our target table since\n", + "## it is not possible to load data directly from CSV into a non-native table.\n", + "current_ref.execute(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS nba.salaries_temp (Season STRING,\n", + " Team STRING, Salary STRING, Player STRING)\n", + " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", + ")\n", + "\n", + "current_ref.execute(f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/salaries.csv\" OVERWRITE INTO TABLE nba.salaries_temp')\n", + "current_ref.execute(\"INSERT OVERWRITE TABLE nba.salaries SELECT * FROM nba.salaries_temp\")\n", + "\n", + "print(\"\\nCreated and inserted data into table nba.salaries from dataset salaries\\n\")\n", + "\n", + "\n", + "# Creating `totals_stats` table\n", + "\n", + "current_ref.execute(\n", + " f\"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats (\n", + " Season STRING, Age STRING, Team STRING, ORB STRING,\n", + " DRB STRING, TRB STRING, AST STRING, STL STRING,\n", + " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", + " STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'\n", + " LOCATION '{warehouse}/nba/totals_stats'\n", + " TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',\n", + " 'iceberg.mr.in.memory.data.model'='GENERIC')\"\"\"\n", + ")\n", + "\n", + "## We create a temporary table to load data into our target table since\n", + "## it is not possible to load data directly from CSV into a non-native table.\n", + "current_ref.execute(\n", + " \"\"\"CREATE TABLE IF NOT EXISTS nba.totals_stats_temp (\n", + " Season STRING, Age STRING, Team STRING, ORB STRING,\n", + " DRB STRING, TRB STRING, AST STRING, STL STRING,\n", + " BLK STRING, TOV STRING, PTS STRING, Player STRING, RSorPO STRING)\n", + " ROW FORMAT DELIMITED FIELDS TERMINATED BY ','\"\"\"\n", + ")\n", + "\n", + "current_ref.execute(\n", + " f'LOAD DATA LOCAL INPATH \"{datasets_path}/nba/totals_stats.csv\" OVERWRITE INTO TABLE nba.totals_stats_temp'\n", + ")\n", + "current_ref.execute(\"INSERT OVERWRITE TABLE nba.totals_stats SELECT * FROM nba.totals_stats_temp\")\n", + "\n", + "print(\"\\nCreated and inserted data into table nba.totals_stats from dataset totals_stats\\n\")" + ] + },
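The cell above runs the same CSV-to-temp-table-to-Iceberg sequence twice. A hedged refactoring sketch (the helper name and signature are ours, not part of the demo) that captures the pattern with the exact statements used above:

```python
# Hypothetical helper: create an Iceberg table plus a CSV-backed temp table,
# load the CSV, and copy the rows across -- mirroring the cell above.
def load_csv_into_iceberg(cursor, table, columns, csv_file):
    cols = ", ".join(c + " STRING" for c in columns)
    location = table.replace(".", "/")  # e.g. "nba.salaries" -> "nba/salaries"
    cursor.execute(
        f"""CREATE TABLE IF NOT EXISTS {table} ({cols})
        STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'
        LOCATION '{warehouse}/{location}'
        TBLPROPERTIES ('iceberg.catalog'='dev_catalog', 'write.format.default'='parquet',
        'iceberg.mr.in.memory.data.model'='GENERIC')"""
    )
    cursor.execute(
        f"""CREATE TABLE IF NOT EXISTS {table}_temp ({cols})
        ROW FORMAT DELIMITED FIELDS TERMINATED BY ','"""
    )
    cursor.execute(f'LOAD DATA LOCAL INPATH "{datasets_path}/{csv_file}" OVERWRITE INTO TABLE {table}_temp')
    cursor.execute(f"INSERT OVERWRITE TABLE {table} SELECT * FROM {table}_temp")

# e.g. load_csv_into_iceberg(current_ref, "nba.salaries", ["Season", "Team", "Salary", "Player"], "nba/salaries.csv")
```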
+ { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Now we count the rows in our tables to ensure they hold the same number of rows as the CSV files. Unlike the Spark and Flink demos, we can't use the `table@branch` notation (see the GitHub issue [here](https://github.com/projectnessie/nessie/issues/1985)). Therefore, we set the Nessie ref through the Hive setting `SET iceberg.catalog.{catalog}.ref = {branch}` whenever we want to work on a specific branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# We make sure we are still on the dev branch\n", + "current_ref = switch_ref_catalog(\"dev\")\n", + "\n", + "print(\"\\nCounting rows in nba.salaries\\n\")\n", + "\n", + "# We count now\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)\n", + "\n", + "print(\"\\nCounting rows in nba.totals_stats\\n\")\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + },
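As the markdown above notes, without `table@branch` support the branch is selected through `SET iceberg.catalog.{catalog}.ref = {branch}`. The demo plays it safe and reconnects via `switch_ref_catalog`, but, assuming that Hive setting can be changed on a live session, a sketch of flipping the ref on an existing cursor would look like this:

```python
# Sketch only: point an already-open cursor at another Nessie branch using the
# Hive setting described above, instead of reconnecting.
def set_ref(cursor, branch):
    cursor.execute(f"SET iceberg.catalog.dev_catalog.ref={branch}")

# set_ref(current_ref, "dev")  # subsequent queries on this cursor would read `dev`
```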
+ { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check generated tables\n", + "----------------------------\n", + "Since we have been working solely on the `dev` branch, where we created two tables and added some data,\n", + "let's verify that the `main` branch was not altered by our changes."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "And on the `dev` branch we expect to see two tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie content list --ref dev" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can also verify that the `dev` and `main` branches point to different commits" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dev promotion into main\n", + "-----------------------\n", + "Once we are done with our changes on the `dev` branch, we would like to merge those changes into `main`.\n", + "We merge `dev` into `main` via the command line `merge` command.\n", + "Both branches should be at the same revision after merging/promotion." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie merge dev -b main --force" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "We can verify that the `main` branch now contains the expected tables and row counts.\n", + "\n", + "The tables are now on `main` and ready for consumption by our blog authors and analysts!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# We switch to main branch\n", + "current_ref = switch_ref_catalog(\"main\")\n", + "\n", + "print(\"\\nCounting rows in nba.salaries\\n\")\n", + "\n", + "# We count now\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)\n", + "\n", + "print(\"\\nCounting rows in nba.totals_stats\\n\")\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.totals_stats_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Perform regular ETL on the new tables\n", + "-------------------\n", + "Our analysts are happy with the data and we want to now regularly ingest data to keep things up to date. Our first ETL job consists of the following:\n", + "\n", + "1. Update the salaries table to add new data\n", + "2. 
We have decided the `Age` column isn't required in the `totals_stats` table, so we will drop the column\n", + "3. We create a new table to hold information about the players' appearances in All-Star games\n", + "\n", + "As always, we will do this work on a branch and verify the results. This ETL job can then be set up to run nightly with new stats and salary information (a rough sketch of such a job follows the next cell)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_ref = create_ref_catalog(\"etl\")" + ] + },
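Purely as an illustration of the nightly run mentioned above (the function and its scheduling are ours, not the demo's), such a job could reuse `create_ref_catalog` and shell out to the same Nessie CLI commands the notebook runs interactively:

```python
# Illustrative sketch: run the day's DML on a short-lived branch and promote it to
# main only when every statement succeeded, mirroring the manual steps below.
import subprocess

def run_nightly_etl(branch="etl"):
    cursor = create_ref_catalog(branch)  # branch off main and get a cursor on it
    # ... the day's INSERT/ALTER statements go here, e.g.:
    # cursor.execute("INSERT INTO nba.salaries VALUES (...)")
    subprocess.run(["nessie", "merge", branch, "-b", "main", "--force"], check=True)
    subprocess.run(["nessie", "branch", "--delete", branch], check=True)
```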
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should see the new `allstar_games_stats` table on the `etl` branch\n", + "!nessie content list --ref etl" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we are happy with the data we can again merge it into `main`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie merge etl -b main --force" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%% md\n" + } + }, + "source": [ + "Now lets verify that the changes exist on the `main` branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie content list" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "!nessie --verbose branch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# We switch to the main branch\n", + "current_ref = switch_ref_catalog(\"main\")\n", + "\n", + "print(\"\\nCounting rows in nba.allstar_games_stats\\n\")\n", + "\n", + "# We count now\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_games_stats\")\n", + "table_count = current_ref.fetchone()[0]\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.allstar_table_temp\")\n", + "csv_count = current_ref.fetchone()[0]\n", + "assert table_count == csv_count\n", + "print(table_count)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create `experiment` branch\n", + "--------------------------------\n", + "As a data analyst we might want to carry out some experiments with some data, without affecting `main` in any way.\n", + "As in the previous examples, we can just get started by creating an `experiment` branch off of `main`\n", + "and carry out our experiment, which could consist of the following steps:\n", + "- drop `totals_stats` table\n", + "- add data to `salaries` table\n", + "- compare `experiment` and `main` tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_ref = create_ref_catalog(\"experiment\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the `totals_stats` table on the `experiment` branch\n", + "current_ref.execute(\"DROP TABLE nba.totals_stats\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add some salaries for Dirk Nowitzki\n", + "current_ref.execute(\n", + " \"\"\"INSERT INTO nba.salaries VALUES\n", + " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n", + " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n", + " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n", + " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `totals_stats`)\n", + "!nessie 
+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Drop the `totals_stats` table on the `experiment` branch\n", + "current_ref.execute(\"DROP TABLE nba.totals_stats\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# add some salaries for Dirk Nowitzki\n", + "current_ref.execute(\n", + " \"\"\"INSERT INTO nba.salaries VALUES\n", + " ('2015-16', 'Dallas Mavericks', '$8333333', 'Dirk Nowitzki'),\n", + " ('2016-17', 'Dallas Mavericks', '$25000000', 'Dirk Nowitzki'),\n", + " ('2017-18', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki'),\n", + " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We should see the `salaries` and `allstar_games_stats` tables only (since we just dropped `totals_stats`)\n", + "!nessie content list --ref experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "jupyter": { + "outputs_hidden": false + }, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "# `main` hasn't been changed and still has the `totals_stats` table\n", + "!nessie content list" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's take a look at the contents of the `salaries` table on the `experiment` branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_ref = switch_ref_catalog(\"experiment\")\n", + "\n", + "print(\"\\nCounting rows in nba.salaries\\n\")\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", + "print(current_ref.fetchone()[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and compare it to the contents of the `salaries` table on the `main` branch." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "current_ref = switch_ref_catalog(\"main\")\n", + "\n", + "# the following INSERT was originally a workaround for https://github.com/apache/iceberg/pull/4509 (fixed in Iceberg 0.13.2)\n", + "# add a single salary for Dirk Nowitzki (so we expect 3 fewer total rows than on `experiment`)\n", + "current_ref.execute(\n", + " \"\"\"INSERT INTO nba.salaries VALUES\n", + " ('2018-19', 'Dallas Mavericks', '$5000000', 'Dirk Nowitzki')\"\"\"\n", + ")\n", + "\n", + "print(\"\\nCounting rows in nba.salaries\\n\")\n", + "\n", + "current_ref.execute(\"SELECT COUNT(*) FROM nba.salaries\")\n", + "print(current_ref.fetchone()[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And finally, let's clean up after ourselves" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!nessie branch --delete dev\n", + "!nessie branch --delete etl\n", + "!nessie branch --delete experiment" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/requirements_dev.txt b/notebooks/requirements_dev.txt index 3d705656..fbbb4280 100644 --- a/notebooks/requirements_dev.txt +++ b/notebooks/requirements_dev.txt @@ -15,18 +15,18 @@ # -r requirements.txt assertpy==1.1 +build==0.10.0 bump2version==1.0.1 -build==0.8.0 -ipython==7.34.0 -jupyterlab==3.4.7 +ipython==8.18.0 +jupyterlab==3.6.6 nbstripout==0.6.1 -pip==22.2.2 -pytest==7.1.3 -pytest-mock==3.8.2 -pytest-mypy==0.9.1 +pip==23.3.1 +pytest==7.4.3 +pytest-mock==3.12.0 +pytest-mypy==0.10.3 pytest-runner==6.0.0 testbook[dev]==0.4.2 -tox==3.26.0 +tox==4.11.3 twine==4.0.1 watchdog==2.1.9 -wheel==0.37.1 +wheel==0.41.3 diff --git a/notebooks/requirements_lint.txt b/notebooks/requirements_lint.txt index 57328b6c..6936aebc 100644 --- a/notebooks/requirements_lint.txt +++ b/notebooks/requirements_lint.txt @@ -14,14 +14,13 @@ # limitations under the License.
# -r requirements_dev.txt -bandit==1.7.4 -black[jupyter]==22.8.0 -flake8==5.0.4 -flake8-annotations==2.9.1 +bandit==1.7.5 +black[jupyter]==23.11.0 +flake8==6.1.0 +flake8-annotations==3.0.1 flake8-bandit==4.1.1 -flake8-black==0.3.3 -flake8-bugbear==22.9.23 -flake8-docstrings==1.6.0 -flake8-import-order==0.18.1 -pytest-mypy==0.9.1 -safety==2.2.0 +flake8-black==0.3.6 +flake8-bugbear==23.9.16 +flake8-docstrings==1.7.0 +flake8-import-order==0.18.2 +pytest-mypy==0.10.3 diff --git a/notebooks/tests/__init__.py b/notebooks/tests/__init__.py index 4c3decb3..3e363460 100644 --- a/notebooks/tests/__init__.py +++ b/notebooks/tests/__init__.py @@ -1,4 +1,5 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- # # Copyright (C) 2020 Dremio # @@ -16,14 +17,12 @@ # """Unit tests for demo notebooks.""" import os -import platform import shutil import subprocess # noqa: S404 from contextlib import contextmanager from typing import Iterator from typing import List -from utils import fetch_nessie from utils import fetch_nessie_jar @@ -66,12 +65,5 @@ def start_nessie() -> Iterator[subprocess.Popen]: def _fetch_and_get_nessie_start_command() -> List[str]: - operating_system = platform.system().lower() - - if operating_system == "darwin": - # In Mac case, we use the nessie jar - runner = fetch_nessie_jar() - return ["java", "-jar", runner] - else: - runner = fetch_nessie() - return ["./" + runner] + runner = fetch_nessie_jar() + return ["java", "-jar", runner] diff --git a/notebooks/tox.ini b/notebooks/tox.ini index 553d6e3f..4c251edf 100644 --- a/notebooks/tox.ini +++ b/notebooks/tox.ini @@ -15,22 +15,22 @@ # [tox] -envlist = py37, format, lint, flink, hive +envlist = py310, format, lint, flink, hive skipsdist = True [gh-actions] python = - 3.7: py37, lint, flink, hive + 3.10: py310, lint, flink, hive [testenv:format] allowlist_externals=bash deps = -r{toxinidir}/requirements_lint.txt commands = - black --target-version py37 tests format_notebooks.py + black --target-version py310 tests ./format_notebooks.py python -m format_notebooks # this formats python code inside the notebooks - bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py37 --line-length 120 --ipynb' + bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py310 --line-length 120 --ipynb' # this formats cell output from single string to list of strings and removes execution metadata bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose nbstripout --keep-output --drop-empty-cells' python -m format_notebooks @@ -40,15 +40,14 @@ allowlist_externals=bash deps = -r{toxinidir}/requirements_lint.txt commands = - flake8 tests format_notebooks.py - bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py37 --line-length 120 --ipynb --check' + flake8 tests ./format_notebooks.py + bash -euo pipefail -c 'ls -1 *.ipynb | xargs --verbose black --target-version py310 --line-length 120 --ipynb --check' [testenv:flink] setenv = PYTHONPATH = {toxinidir}:{toxinidir}/../docker passenv = TOXENV,CI,CODECOV_* deps = - --use-deprecated=legacy-resolver -r{toxinidir}/../docker/binder/requirements_flink.txt -r{toxinidir}/requirements_dev.txt commands = @@ -76,5 +75,5 @@ deps = -r{toxinidir}/../docker/binder/requirements.txt -r{toxinidir}/requirements_dev.txt commands = - nbstripout {toxinidir}/nessie-iceberg-demo-nba.ipynb {toxinidir}/nessie-delta-demo-nba.ipynb + nbstripout {toxinidir}/nessie-iceberg-demo-nba.ipynb pytest --basetemp={envtmpdir} -ra tests --ignore 
tests/test_nessie_iceberg_flink_demo_nba.py --ignore tests/test_nessie_iceberg_hive_demo_nba.py