From 13239f97c98618827a3731045f35b0bc05d817ff Mon Sep 17 00:00:00 2001
From: GitHub Action Website Snapshot <>
Date: Mon, 18 Nov 2024 07:07:14 +0000
Subject: [PATCH] Refreshing website content from main repo.

Source commit: https://github.com/OpenLineage/OpenLineage/commit/fbc00b34404e261878eb2824fad8fa5ebbaec7a6
---
 .../integrations/spark/configuration/usage.md | 72 ++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/docs/integrations/spark/configuration/usage.md b/docs/integrations/spark/configuration/usage.md
index f4f1d79..6ef8cfa 100644
--- a/docs/integrations/spark/configuration/usage.md
+++ b/docs/integrations/spark/configuration/usage.md
@@ -7,7 +7,7 @@ title: Usage
 import Tabs from '@theme/Tabs';
 import TabItem from '@theme/TabItem';
 
-Configuring the OpenLineage Spark integration is straightforward. It uses built-in Spark configuration mechanisms.
+Configuring the OpenLineage Spark integration is straightforward. It uses built-in Spark configuration mechanisms. However, for **Databricks users**, special considerations are required to ensure compatibility and avoid breaking the Spark UI after a cluster shutdown.
 
 Your options are:
 
@@ -27,6 +27,10 @@ The setting `config("spark.extraListeners", "io.openlineage.spark.agent.OpenLine
 the integration ineffective.
 :::
 
+:::note Databricks
+For Databricks users, you must include `com.databricks.backend.daemon.driver.DBCEventLoggingListener` in addition to `io.openlineage.spark.agent.OpenLineageSparkListener` in the `spark.extraListeners` setting. Failure to do so will make the Spark UI inaccessible after a cluster shutdown.
+:::
+
 
 
@@ -50,6 +54,27 @@ object OpenLineageExample extends App {
   spark.stop()
 }
+
+// For Databricks
+import org.apache.spark.sql.SparkSession
+
+object OpenLineageExample extends App {
+  val spark = SparkSession.builder()
+    .appName("OpenLineageExample")
+    // This line is EXTREMELY important
+    .config("spark.extraListeners", "io.openlineage.spark.agent.OpenLineageSparkListener,com.databricks.backend.daemon.driver.DBCEventLoggingListener")
+    .config("spark.openlineage.transport.type", "http")
+    .config("spark.openlineage.transport.url", "http://localhost:5000")
+    .config("spark.openlineage.namespace", "spark_namespace")
+    .config("spark.openlineage.parentJobNamespace", "airflow_namespace")
+    .config("spark.openlineage.parentJobName", "airflow_dag.airflow_task")
+    .config("spark.openlineage.parentRunId", "xxxx-xxxx-xxxx-xxxx")
+    .getOrCreate()
+
+  // ... your code
+
+  spark.stop()
+}
 ```
 
@@ -71,6 +96,24 @@ spark = SparkSession.builder
 
 # ... your code
 
+spark.stop()
+
+# For Databricks
+from pyspark.sql import SparkSession
+
+spark = SparkSession.builder \
+    .appName("OpenLineageExample") \
+    .config("spark.extraListeners", "io.openlineage.spark.agent.OpenLineageSparkListener,com.databricks.backend.daemon.driver.DBCEventLoggingListener") \
+    .config("spark.openlineage.transport.type", "http") \
+    .config("spark.openlineage.transport.url", "http://localhost:5000") \
+    .config("spark.openlineage.namespace", "spark_namespace") \
+    .config("spark.openlineage.parentJobNamespace", "airflow_namespace") \
+    .config("spark.openlineage.parentJobName", "airflow_dag.airflow_task") \
+    .config("spark.openlineage.parentRunId", "xxxx-xxxx-xxxx-xxxx") \
+    .getOrCreate()
+
+# ... your code
+
 spark.stop()
 ```
 
@@ -81,6 +124,10 @@ spark.stop()
 
 The below example demonstrates how to use the `--conf` option with `spark-submit`.
 
+:::note Databricks
+Remember to include `com.databricks.backend.daemon.driver.DBCEventLoggingListener` along with the OpenLineage listener.
+:::
+
 ```bash
 spark-submit \
   --conf "spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener" \
@@ -91,6 +138,17 @@ spark-submit \
   --conf "spark.openlineage.parentJobName=airflow_dag.airflow_task" \
   --conf "spark.openlineage.parentRunId=xxxx-xxxx-xxxx-xxxx" \
   # ... other options
+
+# For Databricks
+spark-submit \
+  --conf "spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener,com.databricks.backend.daemon.driver.DBCEventLoggingListener" \
+  --conf "spark.openlineage.transport.type=http" \
+  --conf "spark.openlineage.transport.url=http://localhost:5000" \
+  --conf "spark.openlineage.namespace=spark_namespace" \
+  --conf "spark.openlineage.parentJobNamespace=airflow_namespace" \
+  --conf "spark.openlineage.parentJobName=airflow_dag.airflow_task" \
+  --conf "spark.openlineage.parentRunId=xxxx-xxxx-xxxx-xxxx" \
+  # ... other options
 ```
 
 #### Adding properties to the `spark-defaults.conf` file in the `${SPARK_HOME}/conf` directory
@@ -104,6 +162,10 @@ installation, particularly in a shared environment.
 
 The below example demonstrates how to add properties to the `spark-defaults.conf` file.
 
+:::note Databricks
+For Databricks users, include `com.databricks.backend.daemon.driver.DBCEventLoggingListener` in the `spark.extraListeners` property.
+:::
+
 ```properties
 spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener
 spark.openlineage.transport.type=http
@@ -111,6 +173,14 @@ spark.openlineage.transport.url=http://localhost:5000
 spark.openlineage.namespace=MyNamespace
 ```
 
+For Databricks:
+```properties
+spark.extraListeners=io.openlineage.spark.agent.OpenLineageSparkListener,com.databricks.backend.daemon.driver.DBCEventLoggingListener
+spark.openlineage.transport.type=http
+spark.openlineage.transport.url=http://localhost:5000
+spark.openlineage.namespace=MyNamespace
+```
+
 :::info
 The `spark.extraListeners` configuration parameter is **non-additive**. This means that if you set
 `spark.extraListeners` via the CLI or via `SparkSession#config`, it will **replace** the value