WIP
snazy committed Nov 27, 2023
1 parent 2fbd1be commit 75e2b09
Showing 23 changed files with 2,452 additions and 5,172 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/demos-docker-build.yaml
@@ -34,7 +34,7 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.7]
python-version: ['3.10']

steps:
- uses: actions/checkout@v3
36 changes: 34 additions & 2 deletions .github/workflows/notebooks.yaml
@@ -38,12 +38,25 @@ jobs:
strategy:
max-parallel: 4
matrix:
python-version: [3.7]
python-version: ['3.10']

steps:
- uses: actions/checkout@v3
- name: Install system dependencies
run: sudo apt-get install libsasl2-dev libsasl2-modules
- name: Set up Java
uses: actions/setup-java@v3
with:
distribution: 'temurin'
# Need Java 8 for Hive + 11 for Spark (and Nessie)
java-version: |
8
11
- name: setup JAVAx_HOME
run: |
echo "JAVA8_HOME=$JAVA_HOME_8_X64" >> ${GITHUB_ENV}
echo "JAVA11_HOME=$JAVA_HOME_11_X64" >> ${GITHUB_ENV}
echo "JAVA_HOME=$JAVA_HOME_11_X64" >> ${GITHUB_ENV}
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
@@ -54,6 +67,20 @@ jobs:
run: |
python -m pip install --upgrade pip
python -m pip install tox tox-gh-actions build
- name: Cache Tools (Hadoop, Hive, Spark)
id: cache-tools
uses: actions/cache@v3
with:
path: |
notebooks/hadoop-*
notebooks/apache-hive-*
notebooks/spark-*
notebooks/iceberg-*.jar
notebooks/nessie-quarkus-*.jar
key: tools-cache-${{ hashFiles('docker/utils/__init__.py') }}
- name: Check Dockerfile has correct registry
run: |
grep -q 'FROM ghcr.io/projectnessie/nessie-binder-demos:.*' binder/Dockerfile
- name: Check Dockerfile hash is up-to-date
if: github.actor != 'renovate'
run: |
@@ -67,5 +94,10 @@ jobs:
fi
echo "PASSED: Dockerfile hash is up-to-date!"
- name: Test Notebooks with Tox
working-directory: notebooks/tests
working-directory: notebooks/
run: tox
- name: Dump Hive output on error
working-directory: notebooks/
if: failure()
run: |
cat nohup.out
4 changes: 4 additions & 0 deletions .gitignore
@@ -25,6 +25,7 @@ notebooks/iceberg-*-runtime-*
notebooks/hadoop-*
notebooks/apache-hive-*-bin
notebooks/metastore_db
notebooks/hiveserver2.pid
notebooks/*.log
notebooks/*.out
# using sed on mac always needs a backup file
@@ -38,6 +39,9 @@ venv/
__pycache__/
.pytest_cache

# pyenv
.python-version

# Jetbrains IDEs
/.idea
*.iws
6 changes: 3 additions & 3 deletions README.md
@@ -21,7 +21,7 @@ Nessie version is set in Binder at `docker/binder/requirements_base.txt`. Curren

### Iceberg

Currently we are using Iceberg `0.13.1` and it is specified in both iceberg notebooks as well as `docker/utils/__init__.py`
Currently we are using Iceberg `1.4.2` and it is specified in both iceberg notebooks as well as `docker/utils/__init__.py`
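For illustration, the version pinned in `docker/utils/__init__.py` can presumably be queried the same way `docker/binder/start` reads `utils._SPARK_VERSION`, `utils._HADOOP_VERSION` and `utils._HIVE_VERSION`; the constant name `_ICEBERG_VERSION` below is an assumption, not something this commit confirms:

```shell
# Hypothetical sketch: print the Iceberg version pinned in docker/utils/__init__.py
# (constant name _ICEBERG_VERSION is assumed; run where the `utils` module is importable).
python -c "import utils;print(utils._ICEBERG_VERSION)"
```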

### Spark

@@ -30,7 +30,7 @@ Only has to be updated in `docker/binder/requirements.txt`. Currently, Iceberg s

### Flink

Flink version is set in Binder at `docker/binder/requirements_flink.txt`. Currently, we are using `1.13.6`.
Flink version is set in Binder at `docker/binder/requirements_flink.txt`. Currently, we are using `1.17.1`.

### Hadoop

@@ -53,7 +53,7 @@ Of course, Binder just lets a user "simply start" a notebook via a simple "click

## Development
For development, you will need to make sure to have the following installed:
- Python 3.7+
- Python 3.10+
- pre-commit

Regarding pre-commit, you will need to make sure it is installed through `pre-commit install` in order to install the hooks locally since this repo
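A minimal sketch of that setup, assuming a plain pip-based installation of pre-commit:

```shell
pip install pre-commit        # install the pre-commit tool itself
pre-commit install            # register this repository's hooks in .git/hooks
pre-commit run --all-files    # optional: run every hook once across the whole repo
```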
2 changes: 1 addition & 1 deletion binder/Dockerfile
@@ -2,7 +2,7 @@

# Tag will be automatically generated through pre-commit hook if any changes
# happened in the docker/ folder
FROM ghcr.io/projectnessie/nessie-binder-demos:649ec80b8fa7d9666178380a33b2e645a52d5985
FROM ghcr.io/projectnessie/nessie-binder-demos:dd32c4413d91c22676121f62119bcc7f167e4752

# Create the necessary folders for the demo, this will be created and owned by {NB_USER}
RUN mkdir -p notebooks && mkdir -p datasets
8 changes: 4 additions & 4 deletions binder/README.md
@@ -1,8 +1,8 @@
## Building binder locally

### Prerequisites
You need to have a python 3.7+ installed.
We recommend to use [pyenv](https://github.com/pyenv/pyenv) for managing your python environment(s).
You need to have a python 3.10+ installed.
We recommend to use [pyenv](https://github.com/pyenv/pyenv) for managing your python environment(s).

To build the binder image locally, you first need to install the `jupyter-repo2docker` dependency:
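A sketch of the standard repo2docker workflow, assuming a pip-based install (the repository's own instructions may differ in detail):

```shell
pip install jupyter-repo2docker   # provides the repo2docker CLI
repo2docker .                     # build the Binder image from the repo root and start it locally
```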

@@ -29,8 +29,8 @@ Run (or look into) the `build_run_local_docker.sh` script how to do this semi-au
After those steps, the binder should be running on your local machine.
Next, find the output similar to this:
```shell
[C 13:38:25.199 NotebookApp]
[C 13:38:25.199 NotebookApp]

To access the notebook, open this file in a browser:
file:///home/jovyan/.local/share/jupyter/runtime/nbserver-40-open.html
Or copy and paste this URL:
3 changes: 3 additions & 0 deletions docker/binder/apt.txt
@@ -16,9 +16,12 @@

# Packages needed for mybinder.org

openjdk-8-jdk-headless
openjdk-11-jdk-headless
# SASL lib needed for thrift API to access Hive
libsasl2-dev
libsasl2-modules
# for removal of duplicate files
rdfind
# need `netstat` for start scripts
net-tools
2 changes: 1 addition & 1 deletion docker/binder/postBuild
@@ -26,7 +26,7 @@ python -m ipykernel install --name "flink-demo" --user
python -c "import utils;utils._copy_all_hadoop_jars_to_pyflink()"
conda deactivate

python -c "import utils;utils.fetch_nessie()"
python -c "import utils;utils.fetch_nessie_jar()"

python -c "import utils;utils.fetch_spark()"

8 changes: 5 additions & 3 deletions docker/binder/requirements.txt
@@ -1,5 +1,7 @@
-r requirements_base.txt
findspark==2.0.1
pandas==1.3.5
pyhive[hive]==0.6.5
pyspark==3.2.1
# Need this numpy version due to compatibility reasons with numpy/pyspark
numpy==1.21.6
pandas==1.5.3
pyhive[hive_pure_sasl]==0.7.0
pyspark==3.2.4
2 changes: 1 addition & 1 deletion docker/binder/requirements_base.txt
@@ -1 +1 @@
pynessie==0.30.0
pynessie==0.65.0
4 changes: 1 addition & 3 deletions docker/binder/requirements_flink.txt
@@ -1,4 +1,2 @@
-r requirements_base.txt
apache-flink==1.13.6
# flink requires pandas<1.2.0 see https://github.com/apache/flink/blob/release-1.13.6/flink-python/setup.py#L313
pandas==1.1.5
apache-flink==1.17.1
1 change: 1 addition & 0 deletions docker/binder/runtime.txt
@@ -0,0 +1 @@
python-3.10
22 changes: 18 additions & 4 deletions docker/binder/start
@@ -15,19 +15,33 @@
# limitations under the License.
#

nohup ./nessie-quarkus-runner &

SPARK_VERSION=$(python -c "import utils;print(utils._SPARK_VERSION)")
HADOOP_VERSION=$(python -c "import utils;print(utils._HADOOP_VERSION)")
HIVE_VERSION=$(python -c "import utils;print(utils._HIVE_VERSION)")

export JAVA_HOME=/usr/lib/jvm/java-11-openjdk-amd64
export JAVA11_HOME=/usr/lib/jvm/java-11-openjdk-amd64
export JAVA8_HOME=/usr/lib/jvm/java-8-openjdk-amd64
export JAVA_HOME=$JAVA11_HOME
export PATH=$JAVA_HOME/bin:$PATH

nohup java -jar nessie-quarkus-runner.jar > nohup-nessie.out &
# Wait until Nessie is ready to accept requests.
echo "Waiting up to 180 seconds for Nessie to be ready..."
for _ in $(seq 1 180) ; do
  (netstat -ant | grep -q ':19120 .* LISTEN') && break
  sleep 1
done
if ! netstat -ant | grep -q ':19120 .* LISTEN' ; then
  echo "Nessie did not start / not listening on port 19120!"
  exit 1
fi
echo "Nessie listening on port 19120."

export SPARK_HOME=$PWD/spark-$SPARK_VERSION-bin-hadoop3.2
export HADOOP_HOME=$PWD/hadoop-$HADOOP_VERSION

# Start Hive
chmod +x $PWD/binder/start.hive
nohup $PWD/binder/start.hive $PWD $PWD/binder/resources $HIVE_VERSION
nohup $PWD/binder/start.hive $PWD $PWD/binder/resources $HIVE_VERSION > nohup-hive.out

exec "$@"
73 changes: 69 additions & 4 deletions docker/binder/start.hive
@@ -20,6 +20,8 @@ RESOURCE_DIR=$2
HIVE_VERSION=$3
HIVE_FOLDER_NAME="apache-hive-$HIVE_VERSION-bin"
HIVE_WAREHOUSE_DIR=$HIVE_PARENT_DIR/hive_warehouse
HIVE_PID_FILE=$HIVE_PARENT_DIR/hiveserver2.pid
HIVE_DB=$HIVE_PARENT_DIR/metastore_db

if [ -z "$HIVE_PARENT_DIR" ]; then
echo "Input the parent dir as the first argument"
@@ -38,21 +40,84 @@ fi

export HIVE_HOME=$HIVE_PARENT_DIR/$HIVE_FOLDER_NAME

# Create hive warehouse folder
mkdir $HIVE_WAREHOUSE_DIR

# Copy the needed configs to Hive folder
cp $RESOURCE_DIR/hive/config/hive-site.xml ${HIVE_HOME}/conf/

# Set Hive warehouse path in the hive-site.xml
sed -i.bak "s~HIVE_WAREHOUSE_DIR~$HIVE_WAREHOUSE_DIR~g" ${HIVE_HOME}/conf/hive-site.xml

# Check for Java 8 + 11 for tox (also in /notebooks/tests/scripts/start_hive)
if [[ -z ${JAVA8_HOME} || -z ${JAVA11_HOME} || ! -d ${JAVA8_HOME} || ! -d ${JAVA11_HOME} ]] ; then
cat <<! > /dev/stderr
============================================================================================================
Define the JAVA8_HOME and JAVA11_HOME environment variables to point to Java 8 and Java 11 development kits.
============================================================================================================
Need Java 8 for Hive server to work.
Java 11 (not newer!) is required for Spark, but also Nessie.
!
exit 1
fi

# Kill an already running hiveserver
if [[ -f $HIVE_PID_FILE ]] ; then
kill "$(cat $HIVE_PID_FILE)" || true
rm $HIVE_PID_FILE
fi

# Remove an already existing metastore_db
if [[ -d $HIVE_DB ]] ; then
echo "Removing existing $HIVE_DB"
rm -rf $HIVE_DB
fi

# (Re-)create hive warehouse folder
rm -rf $HIVE_WAREHOUSE_DIR
mkdir -p $HIVE_WAREHOUSE_DIR

# Initialize Hive's Derby database
$HIVE_HOME/bin/schematool -dbType derby -initSchema
echo "Finished initializing Derby database for Hive."

# increase the Heap memory being used by Hive-MapReduce jobs
export HADOOP_HEAPSIZE=1500

# Use Java 8 for Hive :facepalm:
OLD_PATH="$PATH"
export PATH="$JAVA8_HOME/bin:$PATH"
export JAVA_HOME=$JAVA8_HOME
cat <<!
For Hive Server:
================
Using JAVA_HOME=$JAVA_HOME
java binary: $(which java)
$(java -version 2>&1)
!

# Once we are done initializing the database, we start Hive
$HIVE_HOME/bin/hive --service hiveserver2 --hiveconf hive.server2.thrift.port=10000 &
echo "Starting Hive..."
$HIVE_HOME/bin/hive --service hiveserver2 --hiveconf hive.server2.thrift.port=10000 --hiveconf hive.root.logger=INFO,console &
echo $! > $HIVE_PID_FILE
echo "... PID is $(cat $HIVE_PID_FILE)"

# Wait until Hive is ready to accept requests via Thrift. Hive may take some time to start in CI.
echo "Waiting up to 180 seconds for Hive to be ready..."
for _ in $(seq 1 180) ; do
  (netstat -ant | grep -q ':10000 .* LISTEN') && break
  sleep 1
done
if ! netstat -ant | grep -q ':10000 .* LISTEN' ; then
  echo "Hive did not start / not listening on port 10000 (Thrift)!"
  exit 1
fi
echo "Hive listening on port 10000 (Thrift)."

# Reset environment
export JAVA_HOME=$JAVA11_HOME
export PATH=$OLD_PATH