Add support for Spark SQL #113

Open · wants to merge 18 commits into `main`
Changes from 8 commits
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -2,6 +2,8 @@

## Next

* Added support for Hive 2.X.
* Added support for Spark SQL.
* Fixed a case-sensitivity bug with column names. This particularly affected pseudo columns like
`_PARTITIONTIME` and `_PARTITIONDATE` in time-ingestion partitioned BigQuery tables.
* **Backward-incompatible change:** The type of the `_PARTITION_TIME` pseudo-column in
54 changes: 54 additions & 0 deletions README.md
@@ -18,6 +18,7 @@ software versions:
* Hive 2.3.6, 2.3.9, 3.1.2, and 3.1.3.
* Hadoop 2.10.2, 3.2.3, and 3.3.3.
* Tez 0.9.2 on Hadoop 2, and Tez 0.10.1 on Hadoop 3.
* Spark SQL 3.4.1.

## Installation

@@ -474,6 +475,59 @@ session creation time (i.e. when the `SELECT` query is initiated).

Note that this consistency model currently only applies to the table data, not its metadata.

## Spark SQL integration

Dataproc uses a patched version of Spark that automatically detects tables with the `bq.table`
table property and uses the [Spark-BigQuery Connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector)
to access their data. This means that on Dataproc you do not need the Hive-BigQuery Connector
for Spark SQL.

However, if you want to use Spark SQL outside of Dataproc (e.g. in a self-managed Hadoop cluster
on-premises or in a different cloud) to access BigQuery tables, then you must do the following:

* Use Spark 3, which is currently the only supported version. Support for Spark 2 is planned for
  a future release.
* Install the "Hive 2" version of the Hive-BigQuery Connector. This is because Spark 3 itself
  vendors Hive 2 in its codebase. See the [Installation](#installation) section for how to install
  the appropriate connector version in your environment.
* To be able to run `INSERT` queries, set the `spark.sql.extensions` configuration property to
register the connector's Spark extension:
```xml
<property>
<name>spark.sql.extensions</name>
<value>com.google.cloud.hive.bigquery.connector.sparksql.HiveBigQuerySparkSQLExtension</value>
</property>
```
This property isn't necessary if you just need to read data with `SELECT` queries.
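
Alternatively, when launching Spark from the command line, you can pass the same property with
Spark's standard `--conf` flag. For example (a sketch; the jar path is a placeholder for wherever
the connector is installed in your environment):
```sh
spark-sql \
  --jars /path/to/hive-bigquery-connector.jar \
  --conf spark.sql.extensions=com.google.cloud.hive.bigquery.connector.sparksql.HiveBigQuerySparkSQLExtension
```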

### Code samples

Java example:

```java
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

SparkConf sparkConf = new SparkConf().setMaster("local");
SparkSession spark =
    SparkSession.builder()
        .appName("example")
        .config(sparkConf)
        .enableHiveSupport()
        .getOrCreate();
Dataset<Row> ds = spark.sql("SELECT * FROM mytable");
List<Row> rows = ds.collectAsList();
```

Python example:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("example") \
    .config("spark.master", "local") \
    .enableHiveSupport() \
    .getOrCreate()
df = spark.sql("SELECT * FROM mytable")
rows = df.collect()
```
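
Both examples above run read-only `SELECT` queries. For completeness, here is a minimal sketch of
a write through Spark SQL, assuming the connector jar is on the classpath and registering the
extension programmatically rather than in a configuration file (the table name `mytable` is a
placeholder):

```java
import org.apache.spark.sql.SparkSession;

// Register the connector's Spark extension so that INSERT statements are
// handled by the Hive-BigQuery Connector. SELECT queries work without it.
SparkSession spark =
    SparkSession.builder()
        .appName("example")
        .config("spark.master", "local")
        .config(
            "spark.sql.extensions",
            "com.google.cloud.hive.bigquery.connector.sparksql.HiveBigQuerySparkSQLExtension")
        .enableHiveSupport()
        .getOrCreate();

spark.sql("INSERT INTO mytable VALUES (42, 'hello')");
```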

## BigLake integration

[BigLake](https://cloud.google.com/biglake) allows you to store your data in open formats
18 changes: 9 additions & 9 deletions cloudbuild/cloudbuild.yaml
@@ -1,10 +1,10 @@
steps:
# 1. Create a Docker image containing hadoop-connectors repo
# 0. Create a Docker image containing hadoop-connectors repo
- name: 'gcr.io/cloud-builders/docker'
id: 'docker-build'
args: ['build', '--tag=gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit', '-f', 'cloudbuild/Dockerfile', '.']

# 2. Build the connector and download dependencies without running tests.
# 1. Build the connector and download dependencies without running tests.
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'check'
waitFor: ['docker-build']
@@ -13,7 +13,7 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 3. Build the connector and download dependencies without running tests.
# 2. Build the connector and download dependencies without running tests.
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'build'
waitFor: ['check']
@@ -22,7 +22,7 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 4. Run unit tests for Hive 2
# 3. Run unit tests for Hive 2
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'unit-tests-hive2'
waitFor: ['build']
@@ -31,7 +31,7 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 5. Run unit tests for Hive 3
# 4. Run unit tests for Hive 3
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'unit-tests-hive3'
waitFor: ['build']
@@ -40,7 +40,7 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 6. Run integration tests for Hive 2
# 5. Run integration tests for Hive 2
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests-hive2'
waitFor: ['unit-tests-hive2']
@@ -49,7 +49,7 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# 7. Run integration tests for Hive 3
# 6. Run integration tests for Hive 3
- name: 'gcr.io/$PROJECT_ID/dataproc-hive-bigquery-connector-presubmit'
id: 'integration-tests-hive3'
waitFor: ['unit-tests-hive3']
@@ -58,8 +58,8 @@ steps:
env:
- 'CODECOV_TOKEN=${_CODECOV_TOKEN}'

# Tests should take under 90 mins
timeout: 5400s
# Tests should take under 120 mins
timeout: 7200s

options:
machineType: 'N1_HIGHCPU_32'
11 changes: 6 additions & 5 deletions cloudbuild/presubmit.sh
@@ -26,6 +26,7 @@ readonly ACTION=$1

readonly HIVE2_PROFILE="hive2-generic"
readonly HIVE3_PROFILE="hive3-generic"
readonly HIVE3_SHADED_DEPS="shaded-deps-hive3.1.2-hadoop2.10.2"
readonly MVN="./mvnw -B -e -Dmaven.repo.local=/workspace/.repository"

export TEST_BUCKET=dataproc-integ-tests
@@ -37,16 +38,16 @@ cd /workspace
case "$ACTION" in
# Java code style check
check)
./mvnw spotless:check -P"${HIVE2_PROFILE}" && ./mvnw spotless:check -P"${HIVE3_PROFILE}"
$MVN spotless:check -P"${HIVE2_PROFILE}" && $MVN spotless:check -P"${HIVE3_PROFILE}"
exit
;;

# Download maven and all the dependencies
# Build the Maven packages and dependencies
build)
# Install all modules for Hive 2, including parent modules
# Install all modules for Hive 2
$MVN install -DskipTests -P"${HIVE2_PROFILE}"
# Install the shaded deps for Hive 3 (all the other shaded & parent modules have already been installed with the previous command)
$MVN install -DskipTests -P"${HIVE3_PROFILE}" -pl shaded-deps-${HIVE3_PROFILE}
# Install the shaded dependencies for Hive 3 (all the other shaded & parent modules have already been installed with the previous command)
$MVN install -DskipTests -P"${HIVE3_PROFILE}" -pl ${HIVE3_SHADED_DEPS}
exit
;;

40 changes: 32 additions & 8 deletions hive-2-bigquery-connector/pom.xml
@@ -36,14 +36,6 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-acceptance-tests-dependencies</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.github.hiverunner</groupId>
<artifactId>hiverunner</artifactId>
@@ -53,6 +45,26 @@
</dependencies>

<profiles>
<profile>
<!-- Currently the same as "hive2.3.9-hadoop2.10.2" but could be changed later -->
<!-- Use this profile if you don't care about specific minor versions of Hive 2.X -->
<id>hive2-generic</id>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-deps-hive2.3.9-hadoop2.10.2</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-deps-sparksql</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
</profile>
<profile>
<id>hive2.3.6-hadoop2.7.0</id>
<properties>
@@ -70,6 +82,12 @@
<classifier>shaded</classifier>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-deps-sparksql</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
</profile>
<profile>
@@ -82,6 +100,12 @@
<classifier>shaded</classifier>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-deps-sparksql</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
</profile>
</profiles>
@@ -0,0 +1,22 @@
/*
* Copyright 2023 Google Inc. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.google.cloud.hive.bigquery.connector.integration;

public class SparkSQLIntegrationTests extends SparkSQLIntegrationTestsBase {

// Tests are inherited from the superclass

}
38 changes: 28 additions & 10 deletions hive-3-bigquery-connector/pom.xml
@@ -36,14 +36,6 @@
<scope>test</scope>
</dependency>

<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-acceptance-tests-dependencies</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>test</scope>
</dependency>

<dependency>
<groupId>io.github.hiverunner</groupId>
<artifactId>hiverunner</artifactId>
@@ -52,8 +44,21 @@

</dependencies>


<profiles>
<profile>
<!-- Currently the same as "hive3.1.2-hadoop2.10.2" but could be changed later -->
<!-- Use this profile if you don't care about specific minor versions of Hive 3.X -->
<id>hive3-generic</id>
<dependencies>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-deps-hive3.1.2-hadoop2.10.2</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>provided</scope>
</dependency>
</dependencies>
</profile>
<profile>
<id>hive3.1.2-hadoop2.10.2</id>
<dependencies>
@@ -76,6 +81,13 @@
<classifier>shaded</classifier>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-acceptance-tests-dependencies</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>test</scope>
</dependency>
</dependencies>
</profile>
<profile>
@@ -88,11 +100,17 @@
<classifier>shaded</classifier>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>shaded-acceptance-tests-dependencies</artifactId>
<version>${project.version}</version>
<classifier>shaded</classifier>
<scope>test</scope>
</dependency>
</dependencies>
</profile>
</profiles>


<build>
<plugins>
<plugin>