From 37ce4e2a398bb0bd83eedb0e4b961d7463de8569 Mon Sep 17 00:00:00 2001
From: nruest
Date: Fri, 17 Jan 2020 10:07:32 -0500
Subject: [PATCH] Fix for #19.

- Add workaround for aut shaded Tika issue
- Update Spark to 2.4.4
- Update to aut-0.18.1
- README updates
- See also: https://github.com/archivesunleashed/aut/issues/407
- See also: https://github.com/archivesunleashed/aut/releases/tag/aut-0.18.1
---
 Dockerfile | 11 +++++++++--
 README.md  | 14 ++++++--------
 2 files changed, 15 insertions(+), 10 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b1084c8..7d71d8b 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -8,7 +8,7 @@ LABEL website="https://archivesunleashed.org/"
 ## Build variables
 #########################
 
-ARG SPARK_VERSION=2.4.3
+ARG SPARK_VERSION=2.4.4
 
 # Git and Wget
 RUN apk add --update \
@@ -18,6 +18,13 @@ RUN apk add --update \
 # Sample resources
 RUN git clone https://github.com/archivesunleashed/aut-resources.git /aut-resources
 
+# Build aut (workaround for https://github.com/archivesunleashed/docker-aut/issues/19)
+
+RUN git clone https://github.com/archivesunleashed/aut.git /aut \
+    && cd /aut \
+    && git checkout 59b60621500246f48051466005d6a5dc59f74369 \
+    && mvn clean install
+
 # Spark shell
 RUN mkdir /spark \
     && cd /tmp \
@@ -25,4 +32,4 @@ RUN mkdir /spark \
     && tar -xf "/tmp/spark-$SPARK_VERSION-bin-hadoop2.7.tgz" -C /spark --strip-components=1 \
     && rm "/tmp/spark-$SPARK_VERSION-bin-hadoop2.7.tgz"
 
-CMD /spark/bin/spark-shell --packages "io.archivesunleashed:aut:0.18.0"
+CMD /spark/bin/spark-shell --packages "io.archivesunleashed:aut:0.18.1"
diff --git a/README.md b/README.md
index 5ce96e4..2b5874e 100644
--- a/README.md
+++ b/README.md
@@ -38,27 +38,25 @@ If you want to mount your own data:
 You can add any Spark flags to the build if you need to.
 
 ```
-$ docker run --rm -it archivesunleashed/docker-aut:0.18.0 /spark/bin/spark-shell --packages "io.archivesunleashed:aut:0.18.0" --conf spark.network.timeout=100000000 --conf spark.executor.heartbeatInterval=6000s
+$ docker run --rm -it archivesunleashed/docker-aut:0.18.0 /spark/bin/spark-shell --packages "io.archivesunleashed:aut:0.18.1" --conf spark.network.timeout=100000000 --conf spark.executor.heartbeatInterval=6000s
 ```
 
 Once the build finishes, you should see:
 
 ```bash
 $ docker run --rm -it aut
+
+Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
 Setting default log level to "WARN".
 To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
-2017-12-08 00:28:03,803 [main] WARN NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
-2017-12-08 00:28:10,965 [main] WARN ObjectStore - Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 1.2.0
-2017-12-08 00:28:11,130 [main] WARN ObjectStore - Failed to get database default, returning NoSuchObjectException
-2017-12-08 00:28:12,068 [main] WARN ObjectStore - Failed to get database global_temp, returning NoSuchObjectException
-Spark context Web UI available at http://172.17.0.2:4040
-Spark context available as 'sc' (master = local[*], app id = local-1512692884451).
+Spark context Web UI available at http://fee0a4330af9:4040
+Spark context available as 'sc' (master = local[*], app id = local-1579273425545).
 Spark session available as 'spark'.
 Welcome to
       ____              __
      / __/__  ___ _____/ /__
     _\ \/ _ \/ _ `/ __/ '_/
-   /___/ .__/\_,_/_/ /_/\_\   version 2.4.3
+   /___/ .__/\_,_/_/ /_/\_\   version 2.4.4
      /_/
 
 Using Scala version 2.11.12 (OpenJDK 64-Bit Server VM, Java 1.8.0_212)
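
A minimal sketch for exercising this change locally once the patch is applied (the `docker build` invocation and the `aut` tag are assumptions inferred from the README's run examples, not part of this diff):

    $ docker build -t aut .    # from the repo root: fetches Spark 2.4.4 and builds aut at the pinned commit
    $ docker run --rm -it aut  # the image's CMD starts spark-shell with io.archivesunleashed:aut:0.18.1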