diff --git a/Dockerfile b/Dockerfile
index 5a0da7a..a48f4c9 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.6-slim-buster
+FROM python:3.6-slim-stretch
 
 # ADD REPO FOR JDK
 RUN echo "deb http://ftp.us.debian.org/debian sid main" >> /etc/apt/sources.list \
diff --git a/README.md b/README.md
index caf9d48..a4044a4 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,15 @@ Note the last 3 resources are mandatory for the the glue-compatible hive connect
 
 Don't forget to also add S3 IAM permissions for Spark to be able to fetch table data!
 
+### GCP Bigquery/GCS credentials
+
+You must provide a valid path to a GCP service account file using environment variable `GOOGLE_APPLICATION_CREDENTIALS`.
+Otherwise you have to manually set an Access Token after the Spark Context is created using
+
+```python
+spark.conf.set("gcpAccessToken", "")
+```
+
 ## Current release
 
 - 📄 [spark-2.4.5-bin-hadoop2.8-glue.tgz](https://github.com/tinyclues/spark-glue-data-catalog/releases/download/1.0/spark-2.4.5-bin-hadoop2.8-glue.tgz)
@@ -50,6 +59,7 @@ Don't forget to also add S3 IAM permissions for Spark to be able to fetch table 
  - Hadoop 2.8.5
  - Hive 1.2.1
  - AWS SDK 1.11.682
+ - Bigquery Connector 0.19.0
 
 ## Miscellaneous
diff --git a/build-spark.sh b/build-spark.sh
index bbae454..433d868 100755
--- a/build-spark.sh
+++ b/build-spark.sh
@@ -6,6 +6,7 @@ SPARK_VERSION=2.4.5
 HADOOP_VERSION=2.8.5
 HIVE_VERSION=1.2.1
 AWS_SDK_VERSION=1.11.682
+BIGQUERY_CONNECTOR_VERSION=0.19.0
 
 # BUILD HIVE FOR HIVE v1 - needed for spark client
 git clone https://github.com/apache/hive.git /opt/hive
@@ -39,7 +40,7 @@ find /opt/glue -name "*.jar" -exec cp {} jars \;
 # Copy configuration
 cp /conf/* conf
 # Copy AWS jars
-echo :quit | ./bin/spark-shell --conf spark.jars.packages=com.amazonaws:aws-java-sdk:$AWS_SDK_VERSION,org.apache.hadoop:hadoop-aws:$HADOOP_VERSION
+echo :quit | ./bin/spark-shell --conf spark.jars.packages=com.amazonaws:aws-java-sdk:$AWS_SDK_VERSION,org.apache.hadoop:hadoop-aws:$HADOOP_VERSION,com.google.cloud.spark:spark-bigquery-with-dependencies_2.11:$BIGQUERY_CONNECTOR_VERSION,com.google.cloud.bigdataoss:gcs-connector:hadoop2-2.2.0
 cp /root/.ivy2/jars/*.jar jars
 # Create archive
 DIRNAME=spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION%.*}-glue
diff --git a/conf/spark-defaults.conf b/conf/spark-defaults.conf
index e80b221..4ee3e61 100644
--- a/conf/spark-defaults.conf
+++ b/conf/spark-defaults.conf
@@ -6,6 +6,9 @@ spark.hadoop.fs.s3n.impl org.apache.hadoop.fs.s3native.NativeS3FileSystem
 
 spark.hadoop.fs.s3a.experimental.fadvise random
 
+spark.hadoop.fs.gs.impl com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
+spark.hadoop.fs.AbstractFileSystem.gs.impl com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS
+
 spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version 2
 spark.hadoop.mapreduce.fileoutputcommitter.cleanup-failures.ignored true
 