From cf3634a6a9233b98d45560beb8cbabf6f17ff166 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Thu, 17 Oct 2024 19:57:02 +0200 Subject: [PATCH 01/17] fix(ingest/bigquery): Not setting platform instance for bigquery platform resources (#11659) --- .../ingestion/source/bigquery_v2/bigquery.py | 3 - .../bigquery_platform_resource_helper.py | 2 +- .../bigquery_v2/bigquery_mcp_golden.json | 90 +++++++++---------- 3 files changed, 45 insertions(+), 50 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index c30dade921d25..3c6202cc7cbfa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -65,9 +65,6 @@ # We can't use close as it is not called if the ingestion is not successful def cleanup(config: BigQueryV2Config) -> None: if config._credentials_path is not None: - logger.debug( - f"Deleting temporary credential file at {config._credentials_path}" - ) os.unlink(config._credentials_path) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py index d2da895be985d..9da2aceb19220 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py @@ -42,7 +42,7 @@ def platform_resource_key(self) -> PlatformResourceKey: return PlatformResourceKey( platform="bigquery", resource_type="BigQueryLabelInfo", - platform_instance=self.project, + platform_instance=None, primary_key=self.label.primary_key(), ) diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json index b268926f155b7..640ee1bf436b0 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json @@ -201,7 +201,7 @@ }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "platformResourceInfo", "aspect": { @@ -221,30 +221,29 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:bigquery", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + "platform": "urn:li:dataPlatform:bigquery" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "entityType": "platformResource", + "entityUrn": 
"urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -254,13 +253,13 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", "changeType": "UPSERT", "aspectName": "platformResourceInfo", "aspect": { @@ -280,19 +279,50 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:bigquery", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false } }, "systemMetadata": { @@ -395,38 +425,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", From 6f193221c957b4adcc0f15b1e0af0679c1665272 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 11:50:42 -0700 Subject: [PATCH 02/17] fix(ingest/dbt): fix bug in CLL pruning (#11614) --- .../src/datahub/ingestion/source/dbt/dbt_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index c15f1deb43a3a..4cd3c934ce634 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1074,8 +1074,8 @@ def add_node_to_cll_list(dbt_name: str) -> None: for upstream in all_nodes_map[dbt_name].upstream_nodes: schema_nodes.add(upstream) - upstream_node = all_nodes_map[upstream] - if upstream_node.is_ephemeral_model(): + upstream_node = all_nodes_map.get(upstream) + if upstream_node and upstream_node.is_ephemeral_model(): add_node_to_cll_list(upstream) cll_nodes.add(dbt_name) From 523a456f5c9f166c5e99a9cdb33d9b421e199505 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 12:33:04 -0700 Subject: [PATCH 03/17] fix(ingest/redshift): fix syntax error in temp sql (#11661) --- .../src/datahub/ingestion/source/redshift/exception.py | 8 ++++---- .../src/datahub/ingestion/source/redshift/query.py | 5 ++--- .../tests/unit/redshift/redshift_query_mocker.py | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py index 43ad5bfcefdf1..ed0856fc1e292 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py @@ -40,25 +40,25 @@ def report_redshift_failure( error_message = str(e).lower() if "permission denied" in error_message: if "svv_table_info" in error_message: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permission to access 'svv_table_info' table. Please ensure the provided database user has access.", exc=e, ) elif "svl_user_info" in error_message: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permission to access 'svl_user_info' table. 
Please ensure the provided database user has access.", exc=e, ) else: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permissions.", exc=e, ) else: - report.report_failure( + report.failure( title="Failed to extract some metadata", message="Failed to extract some metadata from Redshift.", exc=e, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index f7fad574f7fbe..b18b526ef30fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -569,8 +569,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: end_time_str: str = end_time.strftime(redshift_datetime_format) - return rf"""\ --- DataHub Redshift Source temp table DDL query + return rf"""-- DataHub Redshift Source temp table DDL query select * from ( @@ -645,7 +644,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: ) where rn = 1 - """ +""" # Add this join to the sql query for more metrics on completed queries # LEFT JOIN svl_query_metrics_summary sqms ON ss.query = sqms.query diff --git a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py index 06b592d42914b..10c0250e37e37 100644 --- a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py +++ b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py @@ -56,7 +56,7 @@ def mock_stl_insert_table_cursor(cursor: MagicMock) -> None: query_vs_cursor_mocker = { ( - "\\\n-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n " + "-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n" ): mock_temp_table_cursor, "select * from test_collapse_temp_lineage": mock_stl_insert_table_cursor, } From d7ccf3b2c34fa53085678ab635c733edf8d9fc31 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 16:36:18 -0700 Subject: [PATCH 04/17] docs(airflow): add known limitations for automatic lineage (#11652) --- docs/lineage/airflow.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 35f2ff862e695..829c048a8f8e2 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -132,7 +132,7 @@ conn_id = datahub_rest_default # or datahub_kafka_default ``` | Name | Default value | Description | -|----------------------------|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| -------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | enabled | true | If the plugin should be enabled. | | conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. | | cluster | prod | name of the airflow cluster | @@ -191,6 +191,10 @@ These operators are supported by OpenLineage, but we haven't tested them yet: There's also a few operators (e.g. BashOperator, PythonOperator) that have custom extractors, but those extractors don't generate lineage. --> +Known limitations: + +- We do not fully support operators that run multiple SQL statements at once. In these cases, we'll only capture lineage from the first SQL statement. + ## Manual Lineage Annotation ### Using `inlets` and `outlets` From 6e3724b2daaeeebd1895fc43fda4956827112af3 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 17:50:59 -0700 Subject: [PATCH 05/17] perf(ingest): streamline CLL generation (#11645) --- metadata-ingestion/setup.py | 2 +- .../ingestion/source/redshift/lineage.py | 2 +- .../ingestion/source/tableau/tableau.py | 2 +- .../datahub/sql_parsing/sqlglot_lineage.py | 51 ++++++++++------- .../tableau/test_tableau_ingest.py | 17 ++++-- ...st_bigquery_subquery_column_inference.json | 57 +++++++++++++++++++ .../unit/sql_parsing/test_sqlglot_lineage.py | 15 +++++ 7 files changed, 118 insertions(+), 28 deletions(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index bfec2c00cb864..365da21208ecc 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -101,7 +101,7 @@ sqlglot_lib = { # Using an Acryl fork of sqlglot. 
# https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1 - "acryl-sqlglot[rs]==25.20.2.dev6", + "acryl-sqlglot[rs]==25.25.2.dev9", } classification_lib = { diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index 0e4cb6f1599e5..fe491ccb31850 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -133,7 +133,7 @@ def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str, assert isinstance(parsed_query, sqlglot.exp.Alter) prev_name = parsed_query.this.name rename_clause = parsed_query.args["actions"][0] - assert isinstance(rename_clause, sqlglot.exp.RenameTable) + assert isinstance(rename_clause, sqlglot.exp.AlterRename) new_name = rename_clause.this.name schema = parsed_query.this.db or default_schema diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 9f011790990ec..2c17a5a322f05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -2131,7 +2131,7 @@ def _create_lineage_from_unsupported_csql( fine_grained_lineages: List[FineGrainedLineage] = [] if self.config.extract_column_level_lineage: - logger.info("Extracting CLL from custom sql") + logger.debug("Extracting CLL from custom sql") fine_grained_lineages = make_fine_grained_lineage_class( parsed_result, csql_urn, out_columns ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 273e9d0f9f0b1..6a7ff5be6d1ea 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -1,6 +1,5 @@ import dataclasses import functools -import itertools import logging import traceback from collections import defaultdict @@ -14,6 +13,8 @@ import sqlglot.optimizer.annotate_types import sqlglot.optimizer.optimizer import sqlglot.optimizer.qualify +import sqlglot.optimizer.qualify_columns +import sqlglot.optimizer.unnest_subqueries from datahub.cli.env_utils import get_boolean_env_variable from datahub.ingestion.graph.client import DataHubGraph @@ -63,24 +64,30 @@ SQL_LINEAGE_TIMEOUT_SECONDS = 10 -RULES_BEFORE_TYPE_ANNOTATION: tuple = tuple( - filter( - lambda func: func.__name__ - not in { - # Skip pushdown_predicates because it sometimes throws exceptions, and we - # don't actually need it for anything. - "pushdown_predicates", - # Skip normalize because it can sometimes be expensive. - "normalize", - }, - itertools.takewhile( - lambda func: func != sqlglot.optimizer.annotate_types.annotate_types, - sqlglot.optimizer.optimizer.RULES, - ), - ) +# These rules are a subset of the rules in sqlglot.optimizer.optimizer.RULES. +# If there's a change in their rules, we probably need to re-evaluate our list as well. +assert len(sqlglot.optimizer.optimizer.RULES) == 14 + +_OPTIMIZE_RULES = ( + sqlglot.optimizer.optimizer.qualify, + # We need to enable this in order for annotate types to work. 
+ sqlglot.optimizer.optimizer.pushdown_projections, + # sqlglot.optimizer.optimizer.normalize, # causes perf issues + sqlglot.optimizer.optimizer.unnest_subqueries, + # sqlglot.optimizer.optimizer.pushdown_predicates, # causes perf issues + # sqlglot.optimizer.optimizer.optimize_joins, + # sqlglot.optimizer.optimizer.eliminate_subqueries, + # sqlglot.optimizer.optimizer.merge_subqueries, + # sqlglot.optimizer.optimizer.eliminate_joins, + # sqlglot.optimizer.optimizer.eliminate_ctes, + sqlglot.optimizer.optimizer.quote_identifiers, + # These three are run separately or not run at all. + # sqlglot.optimizer.optimizer.annotate_types, + # sqlglot.optimizer.canonicalize.canonicalize, + # sqlglot.optimizer.simplify.simplify, ) -# Quick check that the rules were loaded correctly. -assert 0 < len(RULES_BEFORE_TYPE_ANNOTATION) < len(sqlglot.optimizer.optimizer.RULES) + +_DEBUG_TYPE_ANNOTATIONS = False class _ColumnRef(_FrozenModel): @@ -385,11 +392,12 @@ def _sqlglot_force_column_normalizer( schema=sqlglot_db_schema, qualify_columns=True, validate_qualify_columns=False, + allow_partial_qualification=True, identify=True, # sqlglot calls the db -> schema -> table hierarchy "catalog", "db", "table". catalog=default_db, db=default_schema, - rules=RULES_BEFORE_TYPE_ANNOTATION, + rules=_OPTIMIZE_RULES, ) except (sqlglot.errors.OptimizeError, ValueError) as e: raise SqlUnderstandingError( @@ -408,6 +416,10 @@ def _sqlglot_force_column_normalizer( except (sqlglot.errors.OptimizeError, sqlglot.errors.ParseError) as e: # This is not a fatal error, so we can continue. logger.debug("sqlglot failed to annotate or parse types: %s", e) + if _DEBUG_TYPE_ANNOTATIONS and logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Type annotated sql %s", statement.sql(pretty=True, dialect=dialect) + ) return statement, _ColumnResolver( sqlglot_db_schema=sqlglot_db_schema, @@ -907,6 +919,7 @@ def _sqlglot_lineage_inner( # At this stage we only want to qualify the table names. The columns will be dealt with later. qualify_columns=False, validate_qualify_columns=False, + allow_partial_qualification=True, # Only insert quotes where necessary. 
identify=False, ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 5a5552a78c56f..3798df07000c8 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -19,6 +19,7 @@ from datahub.configuration.source_common import DEFAULT_ENV from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau.tableau import ( TableauConfig, @@ -37,7 +38,7 @@ FineGrainedLineageUpstreamType, UpstreamLineage, ) -from datahub.metadata.schema_classes import MetadataChangeProposalClass, UpstreamClass +from datahub.metadata.schema_classes import UpstreamClass from tests.test_helpers import mce_helpers, test_connection_helpers from tests.test_helpers.state_helpers import ( get_current_checkpoint_from_pipeline, @@ -939,11 +940,12 @@ def test_tableau_unsupported_csql(): database_override_map={"production database": "prod"} ) - def test_lineage_metadata( + def check_lineage_metadata( lineage, expected_entity_urn, expected_upstream_table, expected_cll ): - mcp = cast(MetadataChangeProposalClass, next(iter(lineage)).metadata) - assert mcp.aspect == UpstreamLineage( + mcp = cast(MetadataChangeProposalWrapper, list(lineage)[0].metadata) + + expected = UpstreamLineage( upstreams=[ UpstreamClass( dataset=expected_upstream_table, @@ -966,6 +968,9 @@ def test_lineage_metadata( ) assert mcp.entityUrn == expected_entity_urn + actual_aspect = mcp.aspect + assert actual_aspect == expected + csql_urn = "urn:li:dataset:(urn:li:dataPlatform:tableau,09988088-05ad-173c-a2f1-f33ba3a13d1a,PROD)" expected_upstream_table = "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.UserDetail,PROD)" expected_cll = { @@ -996,7 +1001,7 @@ def test_lineage_metadata( }, out_columns=[], ) - test_lineage_metadata( + check_lineage_metadata( lineage=lineage, expected_entity_urn=csql_urn, expected_upstream_table=expected_upstream_table, @@ -1014,7 +1019,7 @@ def test_lineage_metadata( }, out_columns=[], ) - test_lineage_metadata( + check_lineage_metadata( lineage=lineage, expected_entity_urn=csql_urn, expected_upstream_table=expected_upstream_table, diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json new file mode 100644 index 0000000000000..32b3830bf1412 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json @@ -0,0 +1,57 @@ +{ + "query_type": "SELECT", + "query_type_props": {}, + "query_fingerprint": "4094ebd230c1d47c7e6879b05ab927e550923b1986eb58c5f3814396cf401d18", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "user_id", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "user_id" + } + ] + }, + { + "downstream": { + "table": null, + "column": "source", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": 
"urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "source" + } + ] + }, + { + "downstream": { + "table": null, + "column": "user_source", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "user_source" + } + ] + } + ], + "debug_info": { + "confidence": 0.2, + "generalized_statement": "SELECT user_id, source, user_source FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY __partition_day DESC) AS rank_ FROM invent_dw.UserDetail) AS source_user WHERE rank_ = ?" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index fb1d2a0bc5011..90cc863d6bd23 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -1253,3 +1253,18 @@ def test_snowflake_drop_schema() -> None: dialect="snowflake", expected_file=RESOURCE_DIR / "test_snowflake_drop_schema.json", ) + + +def test_bigquery_subquery_column_inference() -> None: + assert_sql_result( + """\ +SELECT user_id, source, user_source +FROM ( + SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY __partition_day DESC) AS rank_ + FROM invent_dw.UserDetail +) source_user +WHERE rank_ = 1 +""", + dialect="bigquery", + expected_file=RESOURCE_DIR / "test_bigquery_subquery_column_inference.json", + ) From 179a6714a619a4990c9845eb1997649d94ab53cb Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 18 Oct 2024 10:33:28 +0530 Subject: [PATCH 06/17] feat(ingest): ensure sqlite file delete on clean exit (#11612) --- .../ingestion/source/bigquery_v2/bigquery.py | 9 +- .../source/bigquery_v2/bigquery_queries.py | 4 + .../source/bigquery_v2/queries_extractor.py | 18 +++- .../ingestion/source/redshift/lineage_v2.py | 7 +- .../ingestion/source/redshift/redshift.py | 23 +++-- .../source/snowflake/snowflake_connection.py | 3 +- .../source/snowflake/snowflake_queries.py | 64 ++++++++----- .../source/snowflake/snowflake_v2.py | 94 ++++++++++--------- .../sql_parsing/sql_parsing_aggregator.py | 7 ++ .../bigquery_v2/test_bigquery_queries.py | 16 ++++ .../snowflake/test_snowflake_queries.py | 24 +++++ .../unit/sql_parsing/test_sql_aggregator.py | 22 +++++ 12 files changed, 203 insertions(+), 88 deletions(-) create mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 3c6202cc7cbfa..0e986acc81add 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -272,7 +272,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) - queries_extractor = BigQueryQueriesExtractor( + with BigQueryQueriesExtractor( connection=self.config.get_bigquery_client(), schema_api=self.bq_schema_extractor.schema_api, config=BigQueryQueriesExtractorConfig( @@ -288,9 +288,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, schema_resolver=self.sql_parser_schema_resolver, discovered_tables=self.bq_schema_extractor.table_refs, - ) - 
self.report.queries_extractor = queries_extractor.report - yield from queries_extractor.get_workunits_internal() + ) as queries_extractor: + self.report.queries_extractor = queries_extractor.report + yield from queries_extractor.get_workunits_internal() + else: if self.config.include_usage_statistics: yield from self.usage_extractor.get_usage_workunits( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py index ed27aae19ce96..47f21c9f32353 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py @@ -88,3 +88,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: def get_report(self) -> BigQueryQueriesSourceReport: return self.report + + def close(self) -> None: + self.queries_extractor.close() + self.connection.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index b4a443673b9a9..afaaaf51964f8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -13,6 +13,7 @@ BaseTimeWindowConfig, get_time_bucket, ) +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.source import SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -114,7 +115,7 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig): ) -class BigQueryQueriesExtractor: +class BigQueryQueriesExtractor(Closeable): """ Extracts query audit log and generates usage/lineage/operation workunits. @@ -181,6 +182,7 @@ def __init__( is_allowed_table=self.is_allowed_table, format_queries=False, ) + self.report.sql_aggregator = self.aggregator.report self.report.num_discovered_tables = ( len(self.discovered_tables) if self.discovered_tables else None @@ -273,12 +275,14 @@ def get_workunits_internal( self.report.num_unique_queries = len(queries_deduped) logger.info(f"Found {self.report.num_unique_queries} unique queries") - with self.report.audit_log_load_timer: + with self.report.audit_log_load_timer, queries_deduped: i = 0 for _, query_instances in queries_deduped.items(): for query in query_instances.values(): if i > 0 and i % 10000 == 0: - logger.info(f"Added {i} query log entries to SQL aggregator") + logger.info( + f"Added {i} query log equeries_dedupedntries to SQL aggregator" + ) if self.report.sql_aggregator: logger.info(self.report.sql_aggregator.as_string()) @@ -287,6 +291,11 @@ def get_workunits_internal( yield from auto_workunit(self.aggregator.gen_metadata()) + if not use_cached_audit_log: + queries.close() + shared_connection.close() + audit_log_file.unlink(missing_ok=True) + def deduplicate_queries( self, queries: FileBackedList[ObservedQuery] ) -> FileBackedDict[Dict[int, ObservedQuery]]: @@ -404,6 +413,9 @@ def _parse_audit_log_row(self, row: BigQueryJob) -> ObservedQuery: return entry + def close(self) -> None: + self.aggregator.close() + def _extract_query_text(row: BigQueryJob) -> str: # We wrap select statements in a CTE to make them parseable as DML statement. 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 4df64c80bad8a..53f9383ec02a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -5,6 +5,7 @@ import redshift_connector from datahub.emitter import mce_builder +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig @@ -39,7 +40,7 @@ logger = logging.getLogger(__name__) -class RedshiftSqlLineageV2: +class RedshiftSqlLineageV2(Closeable): # does lineage and usage based on SQL parsing. def __init__( @@ -56,6 +57,7 @@ def __init__( self.context = context self.database = database + self.aggregator = SqlParsingAggregator( platform=self.platform, platform_instance=self.config.platform_instance, @@ -436,3 +438,6 @@ def generate(self) -> Iterable[MetadataWorkUnit]: message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.", context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}", ) + + def close(self) -> None: + self.aggregator.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index a9fc9ab8f3e99..76030cea98494 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -451,24 +451,23 @@ def _extract_metadata( ) if self.config.use_lineage_v2: - lineage_extractor = RedshiftSqlLineageV2( + with RedshiftSqlLineageV2( config=self.config, report=self.report, context=self.ctx, database=database, redundant_run_skip_handler=self.redundant_lineage_run_skip_handler, - ) - - yield from lineage_extractor.aggregator.register_schemas_from_stream( - self.process_schemas(connection, database) - ) + ) as lineage_extractor: + yield from lineage_extractor.aggregator.register_schemas_from_stream( + self.process_schemas(connection, database) + ) - self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) - yield from self.extract_lineage_v2( - connection=connection, - database=database, - lineage_extractor=lineage_extractor, - ) + self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) + yield from self.extract_lineage_v2( + connection=connection, + database=database, + lineage_extractor=lineage_extractor, + ) all_tables = self.get_all_tables() else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index d39e95a884dbc..a9f454cfd3cdb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -18,6 +18,7 @@ from datahub.configuration.connection_resolver import auto_connection_resolver from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.source.snowflake.constants import ( CLIENT_PREFETCH_THREADS, 
CLIENT_SESSION_KEEP_ALIVE, @@ -364,7 +365,7 @@ def get_connection(self) -> "SnowflakeConnection": ) from e -class SnowflakeConnection: +class SnowflakeConnection(Closeable): _connection: NativeSnowflakeConnection def __init__(self, connection: NativeSnowflakeConnection): diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 1445d02aa49db..e11073d77b46e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -1,3 +1,4 @@ +import contextlib import dataclasses import functools import json @@ -17,6 +18,7 @@ BaseTimeWindowConfig, BucketDuration, ) +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.report import Report from datahub.ingestion.api.source import Source, SourceReport @@ -121,7 +123,7 @@ class SnowflakeQueriesSourceReport(SourceReport): queries_extractor: Optional[SnowflakeQueriesExtractorReport] = None -class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin): +class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable): def __init__( self, connection: SnowflakeConnection, @@ -143,28 +145,33 @@ def __init__( self._structured_report = structured_report - self.aggregator = SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.identifiers.identifier_config.platform_instance, - env=self.identifiers.identifier_config.env, - schema_resolver=schema_resolver, - graph=graph, - eager_graph_load=False, - generate_lineage=self.config.include_lineage, - generate_queries=self.config.include_queries, - generate_usage_statistics=self.config.include_usage_statistics, - generate_query_usage_statistics=self.config.include_query_usage_statistics, - usage_config=BaseUsageConfig( - bucket_duration=self.config.window.bucket_duration, - start_time=self.config.window.start_time, - end_time=self.config.window.end_time, - user_email_pattern=self.config.user_email_pattern, - # TODO make the rest of the fields configurable - ), - generate_operations=self.config.include_operations, - is_temp_table=self.is_temp_table, - is_allowed_table=self.is_allowed_table, - format_queries=False, + # The exit stack helps ensure that we close all the resources we open. 
+ self._exit_stack = contextlib.ExitStack() + + self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.identifiers.identifier_config.platform_instance, + env=self.identifiers.identifier_config.env, + schema_resolver=schema_resolver, + graph=graph, + eager_graph_load=False, + generate_lineage=self.config.include_lineage, + generate_queries=self.config.include_queries, + generate_usage_statistics=self.config.include_usage_statistics, + generate_query_usage_statistics=self.config.include_query_usage_statistics, + usage_config=BaseUsageConfig( + bucket_duration=self.config.window.bucket_duration, + start_time=self.config.window.start_time, + end_time=self.config.window.end_time, + user_email_pattern=self.config.user_email_pattern, + # TODO make the rest of the fields configurable + ), + generate_operations=self.config.include_operations, + is_temp_table=self.is_temp_table, + is_allowed_table=self.is_allowed_table, + format_queries=False, + ) ) self.report.sql_aggregator = self.aggregator.report @@ -248,6 +255,10 @@ def get_workunits_internal( self.aggregator.add(query) yield from auto_workunit(self.aggregator.gen_metadata()) + if not use_cached_audit_log: + queries.close() + shared_connection.close() + audit_log_file.unlink(missing_ok=True) def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: # Derived from _populate_external_lineage_from_copy_history. @@ -426,6 +437,9 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: ) return entry + def close(self) -> None: + self._exit_stack.close() + class SnowflakeQueriesSource(Source): def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig): @@ -468,6 +482,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: def get_report(self) -> SnowflakeQueriesSourceReport: return self.report + def close(self) -> None: + self.connection.close() + self.queries_extractor.close() + # Make sure we don't try to generate too much info for a single query. _MAX_TABLES_PER_QUERY = 20 diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 0d7881f36554d..dd7f73268fdc4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -1,3 +1,4 @@ +import contextlib import functools import json import logging @@ -149,7 +150,12 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): cached_domains=[k for k in self.config.domain], graph=self.ctx.graph ) - self.connection = self.config.get_connection() + # The exit stack helps ensure that we close all the resources we open. 
+ self._exit_stack = contextlib.ExitStack() + + self.connection: SnowflakeConnection = self._exit_stack.enter_context( + self.config.get_connection() + ) # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) @@ -157,25 +163,27 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.aggregator: Optional[SqlParsingAggregator] = None if self.config.use_queries_v2 or self.config.include_table_lineage: - self.aggregator = SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - graph=self.ctx.graph, - eager_graph_load=( - # If we're ingestion schema metadata for tables/views, then we will populate - # schemas into the resolver as we go. We only need to do a bulk fetch - # if we're not ingesting schema metadata as part of ingestion. - not ( - self.config.include_technical_schema - and self.config.include_tables - and self.config.include_views - ) - and not self.config.lazy_schema_resolver - ), - generate_usage_statistics=False, - generate_operations=False, - format_queries=self.config.format_sql_queries, + self.aggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + graph=self.ctx.graph, + eager_graph_load=( + # If we're ingestion schema metadata for tables/views, then we will populate + # schemas into the resolver as we go. We only need to do a bulk fetch + # if we're not ingesting schema metadata as part of ingestion. + not ( + self.config.include_technical_schema + and self.config.include_tables + and self.config.include_views + ) + and not self.config.lazy_schema_resolver + ), + generate_usage_statistics=False, + generate_operations=False, + format_queries=self.config.format_sql_queries, + ) ) self.report.sql_aggregator = self.aggregator.report @@ -191,14 +199,16 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): pipeline_name=self.ctx.pipeline_name, run_id=self.ctx.run_id, ) - self.lineage_extractor = SnowflakeLineageExtractor( - config, - self.report, - connection=self.connection, - filters=self.filters, - identifiers=self.identifiers, - redundant_run_skip_handler=redundant_lineage_run_skip_handler, - sql_aggregator=self.aggregator, + self.lineage_extractor = self._exit_stack.enter_context( + SnowflakeLineageExtractor( + config, + self.report, + connection=self.connection, + filters=self.filters, + identifiers=self.identifiers, + redundant_run_skip_handler=redundant_lineage_run_skip_handler, + sql_aggregator=self.aggregator, + ) ) self.usage_extractor: Optional[SnowflakeUsageExtractor] = None @@ -213,13 +223,15 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): pipeline_name=self.ctx.pipeline_name, run_id=self.ctx.run_id, ) - self.usage_extractor = SnowflakeUsageExtractor( - config, - self.report, - connection=self.connection, - filter=self.filters, - identifiers=self.identifiers, - redundant_run_skip_handler=redundant_usage_run_skip_handler, + self.usage_extractor = self._exit_stack.enter_context( + SnowflakeUsageExtractor( + config, + self.report, + connection=self.connection, + filter=self.filters, + identifiers=self.identifiers, + redundant_run_skip_handler=redundant_usage_run_skip_handler, + ) ) self.profiling_state_handler: Optional[ProfilingHandler] = None @@ -444,10 +456,6 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: 
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self._snowflake_clear_ocsp_cache() - self.connection = self.config.get_connection() - if self.connection is None: - return - self.inspect_session_metadata(self.connection) snowsight_url_builder = None @@ -513,7 +521,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: schema_resolver = self.aggregator._schema_resolver - queries_extractor = SnowflakeQueriesExtractor( + queries_extractor: SnowflakeQueriesExtractor = SnowflakeQueriesExtractor( connection=self.connection, config=SnowflakeQueriesExtractorConfig( window=self.config, @@ -535,6 +543,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # it should be pretty straightforward to refactor this and only initialize the aggregator once. self.report.queries_extractor = queries_extractor.report yield from queries_extractor.get_workunits_internal() + queries_extractor.close() else: if self.config.include_table_lineage and self.lineage_extractor: @@ -723,7 +732,4 @@ def _snowflake_clear_ocsp_cache(self) -> None: def close(self) -> None: super().close() StatefulIngestionSourceBase.close(self) - if self.lineage_extractor: - self.lineage_extractor.close() - if self.usage_extractor: - self.usage_extractor.close() + self._exit_stack.close() diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 5f2709fe42660..0b7ad14a8c1b4 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -275,6 +275,8 @@ class SqlAggregatorReport(Report): tool_meta_report: Optional[ToolMetaExtractorReport] = None def compute_stats(self) -> None: + if self._aggregator._closed: + return self.schema_resolver_count = self._aggregator._schema_resolver.schema_count() self.num_unique_query_fingerprints = len(self._aggregator._query_map) @@ -345,6 +347,7 @@ def __init__( # The exit stack helps ensure that we close all the resources we open. self._exit_stack = contextlib.ExitStack() + self._closed: bool = False # Set up the schema resolver. 
self._schema_resolver: SchemaResolver @@ -456,12 +459,16 @@ def __init__( shared_connection=self._shared_connection, tablename="query_usage_counts", ) + self._exit_stack.push(self._query_usage_counts) # Tool Extractor self._tool_meta_extractor = ToolMetaExtractor() self.report.tool_meta_report = self._tool_meta_extractor.report def close(self) -> None: + # Compute stats once before closing connections + self.report.compute_stats() + self._closed = True self._exit_stack.close() @property diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py index 9290100b0c521..ef846f698f156 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py @@ -1,4 +1,5 @@ import json +import os from datetime import datetime from pathlib import Path from unittest.mock import patch @@ -6,7 +7,9 @@ import pytest from freezegun import freeze_time +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.bigquery_v2.bigquery_queries import ( + BigQueryQueriesSource, BigQueryQueriesSourceReport, ) from datahub.metadata.urns import CorpUserUrn @@ -93,3 +96,16 @@ def test_queries_ingestion(project_client, client, pytestconfig, monkeypatch, tm output_path=mcp_output_path, golden_path=mcp_golden_path, ) + + +@patch("google.cloud.bigquery.Client") +@patch("google.cloud.resourcemanager_v3.ProjectsClient") +def test_source_close_cleans_tmp(projects_client, client, tmp_path): + with patch("tempfile.tempdir", str(tmp_path)): + source = BigQueryQueriesSource.create( + {"project_ids": ["project1"]}, PipelineContext("run-id") + ) + assert len(os.listdir(tmp_path)) > 0 + # This closes QueriesExtractor which in turn closes SqlParsingAggregator + source.close() + assert len(os.listdir(tmp_path)) == 0 diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py new file mode 100644 index 0000000000000..82f5691bcee3d --- /dev/null +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py @@ -0,0 +1,24 @@ +import os +from unittest.mock import patch + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.snowflake.snowflake_queries import SnowflakeQueriesSource + + +@patch("snowflake.connector.connect") +def test_source_close_cleans_tmp(snowflake_connect, tmp_path): + with patch("tempfile.tempdir", str(tmp_path)): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + } + }, + PipelineContext("run-id"), + ) + assert len(os.listdir(tmp_path)) > 0 + # This closes QueriesExtractor which in turn closes SqlParsingAggregator + source.close() + assert len(os.listdir(tmp_path)) == 0 diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py index 0d21936a74d07..849d550ef69c5 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py @@ -1,5 +1,7 @@ +import os import pathlib from datetime import datetime, timezone +from unittest.mock import patch import pytest from freezegun import freeze_time @@ -661,3 +663,23 @@ def test_basic_usage(pytestconfig: pytest.Config) -> None: 
outputs=mcps, golden_path=RESOURCE_DIR / "test_basic_usage.json", ) + + +def test_sql_aggreator_close_cleans_tmp(tmp_path): + frozen_timestamp = parse_user_datetime(FROZEN_TIME) + with patch("tempfile.tempdir", str(tmp_path)): + aggregator = SqlParsingAggregator( + platform="redshift", + generate_lineage=False, + generate_usage_statistics=True, + generate_operations=False, + usage_config=BaseUsageConfig( + start_time=get_time_bucket(frozen_timestamp, BucketDuration.DAY), + end_time=frozen_timestamp, + ), + generate_queries=True, + generate_query_usage_statistics=True, + ) + assert len(os.listdir(tmp_path)) > 0 + aggregator.close() + assert len(os.listdir(tmp_path)) == 0 From 011e5b97e7c0f0d330edae59368cc5f8c8512126 Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Fri, 18 Oct 2024 01:12:05 -0700 Subject: [PATCH 07/17] =?UTF-8?q?fix(sdk):=20platform=20resource=20-=20sup?= =?UTF-8?q?port=20indexed=20queries=20when=20urns=20are=20i=E2=80=A6=20(#1?= =?UTF-8?q?1660)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../platformresource/platform_resource.py | 41 +++++++++- .../test_platform_resource.py | 74 +++++++++++++++++++ 2 files changed, 113 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 1556a67a9e555..349b0ff11d84f 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -85,6 +85,7 @@ def scroll_urns_by_filter( self, entity_type: str, extra_or_filters: List[Dict[str, str]], + extra_and_filters: List[Dict[str, str]] = [], ) -> Iterable[str]: """ Scroll through all urns that match the given filters @@ -92,10 +93,26 @@ def scroll_urns_by_filter( key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) assert key_aspect, f"No key aspect found for entity type {entity_type}" + if extra_or_filters and extra_and_filters: + raise ValueError( + "Only one of extra_or_filters and extra_and_filters should be provided" + ) count = 1000 - query = " OR ".join( - [f"{filter['field']}:{filter['value']}" for filter in extra_or_filters] + query = ( + " OR ".join( + [ + f"{filter['field']}:\"{filter['value']}\"" + for filter in extra_or_filters + ] + ) + if extra_or_filters + else " AND ".join( + [ + f"{filter['field']}:\"{filter['value']}\"" + for filter in extra_and_filters + ] + ) ) scroll_id = None while True: @@ -252,3 +269,23 @@ def search_by_key( def delete(self, graph_client: DataHubGraph, hard: bool = True) -> None: graph_client.delete_entity(str(PlatformResourceUrn(self.id)), hard=hard) + + @staticmethod + def search_by_filters( + graph_client: DataHubGraph, + and_filters: List[Dict[str, str]] = [], + or_filters: List[Dict[str, str]] = [], + ) -> Iterable["PlatformResource"]: + if and_filters and or_filters: + raise ValueError( + "Only one of and_filters and or_filters should be provided" + ) + openapi_client = OpenAPIGraphClient(graph_client) + for urn in openapi_client.scroll_urns_by_filter( + entity_type="platformResource", + extra_or_filters=or_filters if or_filters else [], + extra_and_filters=and_filters if and_filters else [], + ): + platform_resource = PlatformResource.from_datahub(graph_client, urn) + if platform_resource: + yield platform_resource diff --git a/smoke-test/tests/platform_resources/test_platform_resource.py 
b/smoke-test/tests/platform_resources/test_platform_resource.py index 7c53f72d843c9..7ebfd4d6ea15b 100644 --- a/smoke-test/tests/platform_resources/test_platform_resource.py +++ b/smoke-test/tests/platform_resources/test_platform_resource.py @@ -112,3 +112,77 @@ def test_platform_resource_non_existent(graph_client, test_id): graph_client=graph_client, ) assert platform_resource is None + + +def test_platform_resource_urn_secondary_key(graph_client, test_id): + key = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_{test_id}", + ) + dataset_urn = ( + f"urn:li:dataset:(urn:li:dataPlatform:test,test_secondary_key_{test_id},PROD)" + ) + platform_resource = PlatformResource.create( + key=key, + value={"test_key": f"test_value_{test_id}"}, + secondary_keys=[dataset_urn], + ) + platform_resource.to_datahub(graph_client) + wait_for_writes_to_sync() + + read_platform_resources = [ + r + for r in PlatformResource.search_by_key( + graph_client, dataset_urn, primary=False + ) + ] + assert len(read_platform_resources) == 1 + assert read_platform_resources[0] == platform_resource + + +def test_platform_resource_listing_by_resource_type(graph_client, test_id): + # Generate two resources with the same resource type + key1 = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_1_{test_id}", + ) + platform_resource1 = PlatformResource.create( + key=key1, + value={"test_key": f"test_value_1_{test_id}"}, + ) + platform_resource1.to_datahub(graph_client) + + key2 = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_2_{test_id}", + ) + platform_resource2 = PlatformResource.create( + key=key2, + value={"test_key": f"test_value_2_{test_id}"}, + ) + platform_resource2.to_datahub(graph_client) + + wait_for_writes_to_sync() + + search_results = [ + r + for r in PlatformResource.search_by_filters( + graph_client, + and_filters=[ + { + "field": "resourceType", + "condition": "EQUAL", + "value": key1.resource_type, + } + ], + ) + ] + assert len(search_results) == 2 + + read_platform_resource_1 = next(r for r in search_results if r.id == key1.id) + read_platform_resource_2 = next(r for r in search_results if r.id == key2.id) + assert read_platform_resource_1 == platform_resource1 + assert read_platform_resource_2 == platform_resource2 From dcf4793c3d8fb192b7ae3e3248f50445d59e8912 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Fri, 18 Oct 2024 03:13:18 -0700 Subject: [PATCH 08/17] fix(ingest/dbt): Prevent lineage cycles when parsing sql of dbt models (#11666) --- .../ingestion/source/dbt/dbt_common.py | 6 +++ .../tests/unit/test_dbt_source.py | 42 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 4cd3c934ce634..c95d0e545c598 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1989,6 +1989,11 @@ def _translate_dbt_name_to_upstream_urn(dbt_name: str) -> str: time=mce_builder.get_sys_time(), actor=_DEFAULT_ACTOR, ) + sibling_urn = node.get_urn( + self.config.target_platform, + self.config.env, + self.config.target_platform_instance, + ) return UpstreamLineageClass( upstreams=[ UpstreamClass( @@ 
-1997,6 +2002,7 @@ def _translate_dbt_name_to_upstream_urn(dbt_name: str) -> str: auditStamp=auditStamp, ) for upstream in upstream_urns + if not (node.node_type == "model" and upstream == sibling_urn) ], fineGrainedLineages=( (cll or None) if self.config.include_column_lineage else None diff --git a/metadata-ingestion/tests/unit/test_dbt_source.py b/metadata-ingestion/tests/unit/test_dbt_source.py index 7d01ecd034523..f0d4c3408271f 100644 --- a/metadata-ingestion/tests/unit/test_dbt_source.py +++ b/metadata-ingestion/tests/unit/test_dbt_source.py @@ -10,6 +10,7 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.dbt import dbt_cloud from datahub.ingestion.source.dbt.dbt_cloud import DBTCloudConfig +from datahub.ingestion.source.dbt.dbt_common import DBTNode from datahub.ingestion.source.dbt.dbt_core import ( DBTCoreConfig, DBTCoreSource, @@ -253,6 +254,47 @@ def test_dbt_config_prefer_sql_parser_lineage(): assert config.prefer_sql_parser_lineage is True +def test_dbt_prefer_sql_parser_lineage_no_self_reference(): + ctx = PipelineContext(run_id="test-run-id") + config = DBTCoreConfig.parse_obj( + { + **create_base_dbt_config(), + "skip_sources_in_lineage": True, + "prefer_sql_parser_lineage": True, + } + ) + source: DBTCoreSource = DBTCoreSource(config, ctx, "dbt") + all_nodes_map = { + "model1": DBTNode( + name="model1", + database=None, + schema=None, + alias=None, + comment="", + description="", + language=None, + raw_code=None, + dbt_adapter="postgres", + dbt_name="model1", + dbt_file_path=None, + dbt_package_name=None, + node_type="model", + materialization="table", + max_loaded_at=None, + catalog_type=None, + missing_from_catalog=False, + owner=None, + compiled_code="SELECT d FROM results WHERE d > (SELECT MAX(d) FROM model1)", + ), + } + source._infer_schemas_and_update_cll(all_nodes_map) + upstream_lineage = source._create_lineage_aspect_for_dbt_node( + all_nodes_map["model1"], all_nodes_map + ) + assert upstream_lineage is not None + assert len(upstream_lineage.upstreams) == 1 + + def test_dbt_s3_config(): # test missing aws config config_dict: dict = { From ba7a43f530cfe21c179e32c8712eb040b83a4c13 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Fri, 18 Oct 2024 20:16:08 +0200 Subject: [PATCH 09/17] fix(ingest/dagster): Fix JobSnapshot import is broken (#11672) --- .../datahub_dagster_plugin/client/dagster_generator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py index a2cf159dd12f6..df123b127e040 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py @@ -12,7 +12,13 @@ TableSchemaMetadataValue, ) from dagster._core.execution.stats import RunStepKeyStatsSnapshot, StepEventStatus -from dagster._core.snap import JobSnapshot + +try: + from dagster._core.snap import JobSnapshot # type: ignore[attr-defined] +except ImportError: + # Import changed since Dagster 1.8.12 to this -> https://github.com/dagster-io/dagster/commit/29a37d1f0260cfd112849633d1096ffc916d6c95 + from dagster._core.snap import JobSnap as JobSnapshot + from dagster._core.snap.node import OpDefSnap from dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatsSnapshot from 
datahub.api.entities.datajob import DataFlow, DataJob From 72d1236669c4052488f9ff23517f4568edbe6610 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Fri, 18 Oct 2024 12:01:39 -0700 Subject: [PATCH 10/17] feat(ingest/transformer/domain): Add support for on conflict do nothing to dataset domain transformers (#11649) --- .../docs/transformer/dataset_transformer.md | 28 +++---- .../src/datahub/ingestion/graph/client.py | 1 + .../ingestion/transformer/dataset_domain.py | 41 ++++++---- .../tests/unit/test_transform_dataset.py | 76 +++++++++++++++++++ 4 files changed, 119 insertions(+), 27 deletions(-) diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index d48c6d2c1ab5b..66274ce64a8d2 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -122,12 +122,13 @@ transformers: ``` ## Simple Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|--------------|-------------|---------------------------------------------------------------------| -| `owner_urns` | ✅ | list[string] | | List of owner urns. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| Field | Required | Type | Default | Description | +|--------------------|----------|--------------|-------------|------------------------------------------------------------------------------------------------------------| +| `owner_urns` | ✅ | list[string] | | List of owner urns. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | For transformer behaviour on `replace_existing` and `semantics`, please refer section [Relationship Between replace_existing And semantics](#relationship-between-replace_existing-and-semantics). @@ -191,13 +192,14 @@ transformers: ## Pattern Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|----------------------|-------------|-----------------------------------------------------------------------------------------| -| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | -| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, then ownership will be attached to both the dataset and its container. 
| +| Field | Required | Type | Default | Description | +|--------------------|----------|----------------------|-------------|------------------------------------------------------------------------------------------------------------------------------| +| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, then ownership will be attached to both the dataset and its container. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | let’s suppose we’d like to append a series of users who we know to own a different dataset from a data source but aren't detected during normal ingestion. To do so, we can use the `pattern_add_dataset_ownership` module that’s included in the ingestion framework. This will match the pattern to `urn` of the dataset and assign the respective owners. diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index e8fae6254ae88..1d2528a24c4e5 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -351,6 +351,7 @@ def get_tags(self, entity_urn: str) -> Optional[GlobalTagsClass]: def get_glossary_terms(self, entity_urn: str) -> Optional[GlossaryTermsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=GlossaryTermsClass) + @functools.lru_cache(maxsize=1) def get_domain(self, entity_urn: str) -> Optional[DomainsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainsClass) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py index 6a83824815265..6b78b71eaa78e 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py @@ -1,6 +1,8 @@ import logging +from enum import auto from typing import Callable, Dict, List, Optional, Sequence, Union, cast +from datahub.configuration._config_enum import ConfigEnum from datahub.configuration.common import ( ConfigurationError, KeyValuePattern, @@ -23,6 +25,13 @@ logger = logging.getLogger(__name__) +class TransformerOnConflict(ConfigEnum): + """Describes the behavior of the transformer when writing an aspect that already exists.""" + + DO_UPDATE = auto() # On conflict, apply the new aspect + DO_NOTHING = auto() # On conflict, do not apply the new aspect + + class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): get_domains_to_add: Union[ Callable[[str], DomainsClass], @@ -32,10 +41,12 @@ class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): _resolve_domain_fn = pydantic_resolve_key("get_domains_to_add") is_container: bool = False + on_conflict: TransformerOnConflict = TransformerOnConflict.DO_UPDATE class SimpleDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): domains: List[str] + on_conflict: 
TransformerOnConflict = TransformerOnConflict.DO_UPDATE class PatternDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): @@ -80,12 +91,13 @@ def get_domain_class( @staticmethod def _merge_with_server_domains( - graph: DataHubGraph, urn: str, mce_domain: Optional[DomainsClass] + graph: Optional[DataHubGraph], urn: str, mce_domain: Optional[DomainsClass] ) -> Optional[DomainsClass]: if not mce_domain or not mce_domain.domains: # nothing to add, no need to consult server return None + assert graph server_domain = graph.get_domain(entity_urn=urn) if server_domain: # compute patch @@ -155,7 +167,7 @@ def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: in_domain_aspect: DomainsClass = cast(DomainsClass, aspect) - domain_aspect = DomainsClass(domains=[]) + domain_aspect: DomainsClass = DomainsClass(domains=[]) # Check if we have received existing aspect if in_domain_aspect is not None and self.config.replace_existing is False: domain_aspect.domains.extend(in_domain_aspect.domains) @@ -164,16 +176,18 @@ def transform_aspect( domain_aspect.domains.extend(domain_to_add.domains) - if self.config.semantics == TransformerSemantics.PATCH: - assert self.ctx.graph - patch_domain_aspect: Optional[ - DomainsClass - ] = AddDatasetDomain._merge_with_server_domains( - self.ctx.graph, entity_urn, domain_aspect - ) - return cast(Optional[Aspect], patch_domain_aspect) - - return cast(Optional[Aspect], domain_aspect) + final_aspect: Optional[DomainsClass] = domain_aspect + if domain_aspect.domains: + if self.config.on_conflict == TransformerOnConflict.DO_NOTHING: + assert self.ctx.graph + server_domain = self.ctx.graph.get_domain(entity_urn) + if server_domain and server_domain.domains: + return None + if self.config.semantics == TransformerSemantics.PATCH: + final_aspect = AddDatasetDomain._merge_with_server_domains( + self.ctx.graph, entity_urn, domain_aspect + ) + return cast(Optional[Aspect], final_aspect) class SimpleAddDatasetDomain(AddDatasetDomain): @@ -186,8 +200,7 @@ def __init__( domains = AddDatasetDomain.get_domain_class(ctx.graph, config.domains) generic_config = AddDatasetDomainSemanticsConfig( get_domains_to_add=lambda _: domains, - semantics=config.semantics, - replace_existing=config.replace_existing, + **config.dict(exclude={"domains"}), ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 2e2e85b5d1811..4e9a38cb37ae6 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -56,6 +56,7 @@ from datahub.ingestion.transformer.dataset_domain import ( PatternAddDatasetDomain, SimpleAddDatasetDomain, + TransformerOnConflict, ) from datahub.ingestion.transformer.dataset_domain_based_on_tags import ( DatasetTagDomainMapper, @@ -2498,6 +2499,81 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: assert server_domain in transformed_aspect.domains +def test_simple_add_dataset_domain_on_conflict_do_nothing( + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + server_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def 
fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[server_domain]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 1 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, EndOfStream) + + +def test_simple_add_dataset_domain_on_conflict_do_nothing_no_conflict( + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + irrelevant_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.DomainsClass) + transformed_aspect = cast(models.DomainsClass, output[0].record.aspect) + assert len(transformed_aspect.domains) == 2 + assert datahub_domain in transformed_aspect.domains + assert acryl_domain in transformed_aspect.domains + assert irrelevant_domain not in transformed_aspect.domains + + def test_pattern_add_dataset_domain_aspect_name(mock_datahub_graph_instance): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" From 7e4d4aba122f3ec22825ef5005f759172be26678 Mon Sep 17 00:00:00 2001 From: Jay Feldman <8128360+feldjay@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:58:52 -0400 Subject: [PATCH 11/17] fix(ingest/looker): Remove bad imports from looker_common (#11663) --- .../src/datahub/ingestion/source/looker/looker_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 3cbb13375229b..317b212f7fa96 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -929,7 +929,6 @@ def from_api( # noqa: C901 reporter: SourceReport, source_config: LookerDashboardSourceConfig, ) -> Optional["LookerExplore"]: # noqa: C901 - from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME try: explore = client.lookml_model_explore(model, explore_name) @@ -1194,7 +1193,6 @@ def _to_metadata_events( # noqa: C901 ) -> 
Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]: # We only generate MCE-s for explores that contain from clauses and do NOT contain joins # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph. - from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME dataset_snapshot = DatasetSnapshot( urn=self.get_explore_urn(config), From dfd7293f8ded30f6db5a3a13b045a2c9b5fdf21d Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 13:05:06 -0700 Subject: [PATCH 12/17] feat(ingest/looker): include project name in model/explore properties (#11664) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../ingestion/source/looker/looker_common.py | 16 ++-- .../ingestion/source/looker/looker_source.py | 41 ++++++---- .../looker/golden_looker_mces.json | 7 ++ .../looker/golden_test_allow_ingest.json | 4 + ...olden_test_external_project_view_mces.json | 4 + .../looker/golden_test_file_path_ingest.json | 4 + ...olden_test_folder_path_pattern_ingest.json | 4 + .../golden_test_independent_look_ingest.json | 82 +++++++++++-------- .../looker/golden_test_ingest.json | 4 + .../looker/golden_test_ingest_joins.json | 4 + .../golden_test_ingest_unaliased_joins.json | 4 + ...en_test_non_personal_independent_look.json | 7 ++ .../looker_mces_golden_deleted_stateful.json | 16 ++-- .../looker/looker_mces_usage_history.json | 4 + 14 files changed, 135 insertions(+), 66 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 317b212f7fa96..3d1683100474e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1205,15 +1205,19 @@ def _to_metadata_events( # noqa: C901 dataset_snapshot.aspects.append(browse_paths) dataset_snapshot.aspects.append(StatusClass(removed=False)) - custom_properties = {} - if self.label is not None: - custom_properties["looker.explore.label"] = str(self.label) - if self.source_file is not None: - custom_properties["looker.explore.file"] = str(self.source_file) + custom_properties = { + "project": self.project_name, + "model": self.model_name, + "looker.explore.label": self.label, + "looker.explore.name": self.name, + "looker.explore.file": self.source_file, + } dataset_props = DatasetPropertiesClass( name=str(self.label) if self.label else LookerUtil._display_name(self.name), description=self.description, - customProperties=custom_properties, + customProperties={ + k: str(v) for k, v in custom_properties.items() if v is not None + }, ) dataset_props.externalUrl = self._get_url(base_url) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index f269ccf1cd98f..e42ac7b61c177 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -139,26 +139,21 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase): """ platform = "looker" - source_config: LookerDashboardSourceConfig - reporter: LookerDashboardSourceReport - user_registry: LookerUserRegistry - reachable_look_registry: Set[ - str - ] # Keep track of look-id which are reachable from Dashboard def __init__(self, config: 
LookerDashboardSourceConfig, ctx: PipelineContext): super().__init__(config, ctx) - self.source_config = config - self.reporter = LookerDashboardSourceReport() + self.source_config: LookerDashboardSourceConfig = config + self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() self.looker_api: LookerAPI = LookerAPI(self.source_config) - self.user_registry = LookerUserRegistry(self.looker_api) - self.explore_registry = LookerExploreRegistry( + self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) self.reporter._looker_explore_registry = self.explore_registry self.reporter._looker_api = self.looker_api - self.reachable_look_registry = set() + # Keep track of look-id which are reachable from Dashboard + self.reachable_look_registry: Set[str] = set() # (model, explore) -> list of charts/looks/dashboards that reference this explore # The list values are used purely for debugging purposes. @@ -868,21 +863,31 @@ def _make_explore_metadata_events( ) -> Iterable[ Union[MetadataChangeEvent, MetadataChangeProposalWrapper, MetadataWorkUnit] ]: - if self.source_config.emit_used_explores_only: - explores_to_fetch = list(self.reachable_explores.keys()) - else: + if not self.source_config.emit_used_explores_only: explores_to_fetch = list(self.list_all_explores()) + else: + # We don't keep track of project names for each explore right now. + # Because project names are just used for a custom property, it's + # fine to set them to None. + # TODO: Track project names for each explore. + explores_to_fetch = [ + (None, model, explore) + for (model, explore) in self.reachable_explores.keys() + ] explores_to_fetch.sort() processed_models: List[str] = [] - for model, _ in explores_to_fetch: + for project_name, model, _ in explores_to_fetch: if model not in processed_models: model_key = gen_model_key(self.source_config, model) yield from gen_containers( container_key=model_key, name=model, sub_types=[BIContainerSubTypes.LOOKML_MODEL], + extra_properties=( + {"project": project_name} if project_name is not None else None + ), ) yield MetadataChangeProposalWrapper( entityUrn=model_key.as_urn(), @@ -896,7 +901,7 @@ def _make_explore_metadata_events( self.reporter.total_explores = len(explores_to_fetch) for future in BackpressureAwareExecutor.map( self.fetch_one_explore, - ((model, explore) for (model, explore) in explores_to_fetch), + ((model, explore) for (_project, model, explore) in explores_to_fetch), max_workers=self.source_config.max_threads, ): events, explore_id, start_time, end_time = future.result() @@ -907,7 +912,7 @@ def _make_explore_metadata_events( f"Running time of fetch_one_explore for {explore_id}: {(end_time - start_time).total_seconds()}" ) - def list_all_explores(self) -> Iterable[Tuple[str, str]]: + def list_all_explores(self) -> Iterable[Tuple[Optional[str], str, str]]: # returns a list of (model, explore) tuples for model in self.looker_api.all_lookml_models(): @@ -916,7 +921,7 @@ def list_all_explores(self) -> Iterable[Tuple[str, str]]: for explore in model.explores: if explore.name is None: continue - yield (model.name, explore.name) + yield (model.project_name, model.name, explore.name) def fetch_one_explore( self, model: str, explore: str diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index 5cac7b1bb73b1..a9c445b5986ef 
100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -440,7 +441,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "bogus data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/bogus data/my_view", @@ -616,7 +620,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index 24a738a815cda..af9c62a2a4180 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b1460779da4f5..b89bc356b48fd 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 74400b9b5cc56..810fefd8f6cb8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": 
"my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 89241fb52fb63..3d78397f54a23 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -287,6 +287,7 @@ "description": "third", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -613,7 +614,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index f178e97e78fa0..5a540e61e768d 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -1107,12 +1108,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/sales_model" ] } }, @@ -1124,10 +1125,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1149,7 +1153,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1204,7 +1208,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1223,12 +1227,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ -1240,12 
+1244,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1257,7 +1261,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1267,8 +1271,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1283,12 +1287,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/data" ] } }, @@ -1300,10 +1304,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1325,7 +1332,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1380,7 +1387,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1399,12 +1406,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1416,12 +1423,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1433,7 +1440,7 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1443,8 +1450,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1459,12 +1466,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/order_model" ] } }, @@ -1476,10 +1483,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "order_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1501,7 +1511,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1556,7 +1566,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1575,12 +1585,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1592,12 +1602,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1609,7 +1619,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1619,8 +1629,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": 
"urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index d969ef62a96e5..9ac95b8482a47 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -229,6 +229,7 @@ "urn:li:chart:(looker,ap-south-1.dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -574,7 +575,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 153db363c7828..3a2c6359ea63c 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 98adbdc5b829e..007eee348aeaf 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 63ffdda8c5b6f..859b9163d7aad 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -783,7 +784,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": 
"my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -959,7 +963,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 567ab78a14754..8256c984afb27 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -539,7 +540,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -810,8 +814,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -827,8 +831,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -844,8 +848,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "container", + "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 3befb62a631de..0b3530f9c2462 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -234,7 +235,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", From 8f7f2c17242fb084f8784f015f76808bfda9b593 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 14:29:03 -0700 Subject: [PATCH 13/17] feat(ingest/fivetran): protect against high sync volume (#11589) --- .../ingestion/source/fivetran/fivetran.py | 21 ++++--- .../source/fivetran/fivetran_log_api.py | 63 ++++++++++--------- .../source/fivetran/fivetran_query.py | 34 +++++++--- .../integration/fivetran/test_fivetran.py | 58 ++++------------- 4 files 
changed, 87 insertions(+), 89 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 704a6f20a5c19..334bb58ea84f8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,7 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_log_api import ( + MAX_JOBS_PER_CONNECTOR, + FivetranLogAPI, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -72,11 +75,6 @@ def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext): self.audit_log = FivetranLogAPI(self.config.fivetran_log_config) - # Create and register the stateful ingestion use-case handler. - self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ) - def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: input_dataset_urn_list: List[DatasetUrn] = [] output_dataset_urn_list: List[DatasetUrn] = [] @@ -267,6 +265,13 @@ def _get_connector_workunits( ).as_workunit(is_primary_source=False) # Map Fivetran's job/sync history entity with Datahub's data process entity + if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR: + self.report.warning( + title="Not all sync history was captured", + message=f"The connector had more than {MAX_JOBS_PER_CONNECTOR} sync runs in the past {self.config.history_sync_lookback_period} days. " + f"Only the most recent {MAX_JOBS_PER_CONNECTOR} syncs were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) for job in connector.jobs: dpi = self._generate_dpi_from_job(job, datajob) yield from self._get_dpi_workunits(job, dpi) @@ -279,7 +284,9 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), - self.stale_entity_removal_handler.workunit_processor, + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 31c16139066e4..5908efe39e2b4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -22,6 +22,10 @@ logger: logging.Logger = logging.getLogger(__name__) +# We don't want to generate a massive number of dataProcesses for a single connector. +# This is primarily used as a safeguard to prevent performance issues. 
+MAX_JOBS_PER_CONNECTOR = 1000 + class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -158,34 +162,32 @@ def _get_table_lineage( return table_lineage_list - def _get_all_connector_sync_logs(self, syncs_interval: int) -> Dict[str, Dict]: - sync_logs = {} - for row in self._query( - self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, - syncs_interval=syncs_interval, - ) - ): - if row[Constant.CONNECTOR_ID] not in sync_logs: - sync_logs[row[Constant.CONNECTOR_ID]] = { - row[Constant.SYNC_ID]: { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - } - elif row[Constant.SYNC_ID] not in sync_logs[row[Constant.CONNECTOR_ID]]: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]] = { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - else: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]][ - row["message_event"] - ] = (row[Constant.TIME_STAMP].timestamp(), row[Constant.MESSAGE_DATA]) + def _get_all_connector_sync_logs( + self, syncs_interval: int, connector_ids: List[str] + ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: + sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} + + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + query = self.fivetran_log_query.get_sync_logs_query().format( + db_clause=self.fivetran_log_query.db_clause, + syncs_interval=syncs_interval, + max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, + connector_ids=formatted_connector_ids, + ) + + for row in self._query(query): + connector_id = row[Constant.CONNECTOR_ID] + sync_id = row[Constant.SYNC_ID] + + if connector_id not in sync_logs: + sync_logs[connector_id] = {} + + sync_logs[connector_id][sync_id] = { + "sync_start": (row["start_time"].timestamp(), None), + "sync_end": (row["end_time"].timestamp(), row["end_message_data"]), + } return sync_logs @@ -244,7 +246,10 @@ def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: def _fill_connectors_jobs( self, connectors: List[Connector], syncs_interval: int ) -> None: - sync_logs = self._get_all_connector_sync_logs(syncs_interval) + connector_ids = [connector.connector_id for connector in connectors] + sync_logs = self._get_all_connector_sync_logs( + syncs_interval, connector_ids=connector_ids + ) for connector in connectors: connector.jobs = self._get_jobs_list(sync_logs.get(connector.connector_id)) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index d965f53ff554b..c4680b4b1037a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -37,14 +37,32 @@ def get_users_query(self) -> str: def get_sync_logs_query(self) -> str: return """ - SELECT connector_id, - sync_id, - message_event, - message_data, - time_stamp - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - and time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'""" + WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + 
MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({connector_ids}) + GROUP BY connector_id, sync_id + ) + SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data + FROM ranked_syncs + WHERE rn <= {max_jobs_per_connector} + AND start_time IS NOT NULL + AND end_time IS NOT NULL + ORDER BY connector_id, end_time DESC + """ def get_table_lineage_query(self) -> str: return f""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 0f5d098ee39c4..33ac09e69a3c0 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -101,64 +101,32 @@ def default_query_results( } ] elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, syncs_interval=7 + db_clause=fivetran_log_query.db_clause, + syncs_interval=7, + max_jobs_per_connector=1000, + connector_ids="'calendar_elected'", ): return [ { "connector_id": "calendar_elected", "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "start_time": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "end_time": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), + "end_message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "end_time": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), + "end_message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 5, 403000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_end", - "message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', - "time_stamp": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role 
\\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_end", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 35, 478000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), + "end_time": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), + "end_message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role \\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', }, ] # Unreachable code From 3b1b76244dfc6121e866f9d8803f1491ee6f5cec Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Sat, 19 Oct 2024 14:53:28 -0700 Subject: [PATCH 14/17] feat(sdk):platform-resource - complex queries (#11675) --- .../platformresource/platform_resource.py | 193 ++++++------ .../src/datahub/utilities/openapi_utils.py | 69 +++++ .../src/datahub/utilities/search_utils.py | 285 ++++++++++++++++++ .../test_platform_resource.py | 15 + .../tests/unit/utilities/test_search_utils.py | 71 +++++ .../test_platform_resource.py | 78 ++++- 6 files changed, 617 insertions(+), 94 deletions(-) create mode 100644 metadata-ingestion/src/datahub/utilities/openapi_utils.py create mode 100644 metadata-ingestion/src/datahub/utilities/search_utils.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_search_utils.py diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 349b0ff11d84f..0f7b10a067053 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type, Union, cast from avrogen.dict_wrapper import DictWrapper from pydantic import BaseModel @@ -14,7 +14,14 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import DatahubKey from datahub.ingestion.graph.client import DataHubGraph -from datahub.metadata.urns import PlatformResourceUrn +from datahub.metadata.urns import DataPlatformUrn, PlatformResourceUrn, Urn +from datahub.utilities.openapi_utils import OpenAPIGraphClient +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + LogicalOperator, + SearchField, +) logger = logging.getLogger(__name__) @@ -69,71 +76,75 @@ def to_resource_info(self) -> models.PlatformResourceInfoClass: ) -class OpenAPIGraphClient: +class DataPlatformInstanceUrn: + """ + A simple implementation of a URN class for DataPlatformInstance. + Since this is not present in the URN registry, we need to implement it here. 
+ """ - ENTITY_KEY_ASPECT_MAP = { - aspect_type.ASPECT_INFO.get("keyForEntity"): name - for name, aspect_type in models.ASPECT_NAME_MAP.items() - if aspect_type.ASPECT_INFO.get("keyForEntity") - } + @staticmethod + def create_from_id(platform_instance_urn: str) -> Urn: + if platform_instance_urn.startswith("urn:li:platformInstance:"): + string_urn = platform_instance_urn + else: + string_urn = f"urn:li:platformInstance:{platform_instance_urn}" + return Urn.from_string(string_urn) - def __init__(self, graph: DataHubGraph): - self.graph = graph - self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3" - def scroll_urns_by_filter( - self, - entity_type: str, - extra_or_filters: List[Dict[str, str]], - extra_and_filters: List[Dict[str, str]] = [], - ) -> Iterable[str]: - """ - Scroll through all urns that match the given filters - """ +class UrnSearchField(SearchField): + """ + A search field that supports URN values. + TODO: Move this to search_utils after we make this more generic. + """ - key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) - assert key_aspect, f"No key aspect found for entity type {entity_type}" - if extra_or_filters and extra_and_filters: - raise ValueError( - "Only one of extra_or_filters and extra_and_filters should be provided" - ) + def __init__(self, field_name: str, urn_value_extractor: Callable[[str], Urn]): + self.urn_value_extractor = urn_value_extractor + super().__init__(field_name) - count = 1000 - query = ( - " OR ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_or_filters - ] - ) - if extra_or_filters - else " AND ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_and_filters - ] - ) + def get_search_value(self, value: str) -> str: + return str(self.urn_value_extractor(value)) + + +class PlatformResourceSearchField(SearchField): + def __init__(self, field_name: str): + super().__init__(field_name) + + @classmethod + def from_search_field( + cls, search_field: SearchField + ) -> "PlatformResourceSearchField": + # pretends to be a class method, but just returns the input + return search_field # type: ignore + + +class PlatformResourceSearchFields: + PRIMARY_KEY = PlatformResourceSearchField("primaryKey") + RESOURCE_TYPE = PlatformResourceSearchField("resourceType") + SECONDARY_KEYS = PlatformResourceSearchField("secondaryKeys") + PLATFORM = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platform.keyword", + urn_value_extractor=DataPlatformUrn.create_from_id, ) - scroll_id = None - while True: - response = self.graph._get_generic( - self.openapi_base + f"/entity/{entity_type.lower()}", - params={ - "systemMetadata": "false", - "includeSoftDelete": "false", - "skipCache": "false", - "aspects": [key_aspect], - "scrollId": scroll_id, - "count": count, - "query": query, - }, - ) - entities = response.get("entities", []) - scroll_id = response.get("scrollId") - for entity in entities: - yield entity["urn"] - if not scroll_id: - break + ) + PLATFORM_INSTANCE = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platformInstance.keyword", + urn_value_extractor=DataPlatformInstanceUrn.create_from_id, + ) + ) + + +class ElasticPlatformResourceQuery(ElasticDocumentQuery[PlatformResourceSearchField]): + def __init__(self): + super().__init__() + + @classmethod + def create_from( + cls: Type["ElasticPlatformResourceQuery"], + *args: Tuple[Union[str, PlatformResourceSearchField], str], + ) -> "ElasticPlatformResourceQuery": + return 
cast(ElasticPlatformResourceQuery, super().create_from(*args)) class PlatformResource(BaseModel): @@ -147,6 +158,12 @@ def remove( cls, key: PlatformResourceKey, ) -> "PlatformResource": + """ + Creates a PlatformResource object with the removed status set to True. + Removed PlatformResource objects are used to soft-delete resources from + the graph. + To hard-delete a resource, use the delete method. + """ return cls( id=key.id, removed=True, @@ -240,28 +257,38 @@ def from_datahub( @staticmethod def search_by_key( - graph_client: DataHubGraph, key: str, primary: bool = True + graph_client: DataHubGraph, + key: str, + primary: bool = True, + is_exact: bool = True, ) -> Iterable["PlatformResource"]: - extra_or_filters = [] - extra_or_filters.append( - { - "field": "primaryKey", - "condition": "EQUAL", - "value": key, - } + """ + Searches for PlatformResource entities by primary or secondary key. + + :param graph_client: DataHubGraph client + :param key: The key to search for + :param primary: Whether to search for primary only or expand the search + to secondary keys (default: True) + :param is_exact: Whether to search for an exact match (default: True) + :return: An iterable of PlatformResource objects + """ + + elastic_platform_resource_group = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match( + PlatformResourceSearchFields.PRIMARY_KEY, key, is_exact=is_exact + ) ) if not primary: # we expand the search to secondary keys - extra_or_filters.append( - { - "field": "secondaryKeys", - "condition": "EQUAL", - "value": key, - } + elastic_platform_resource_group.add_field_match( + PlatformResourceSearchFields.SECONDARY_KEYS, key, is_exact=is_exact ) + query = elastic_platform_resource_group.end() openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=extra_or_filters, + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: @@ -273,18 +300,16 @@ def delete(self, graph_client: DataHubGraph, hard: bool = True) -> None: @staticmethod def search_by_filters( graph_client: DataHubGraph, - and_filters: List[Dict[str, str]] = [], - or_filters: List[Dict[str, str]] = [], + query: Union[ + ElasticPlatformResourceQuery, + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + ], ) -> Iterable["PlatformResource"]: - if and_filters and or_filters: - raise ValueError( - "Only one of and_filters and or_filters should be provided" - ) openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=or_filters if or_filters else [], - extra_and_filters=and_filters if and_filters else [], + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: diff --git a/metadata-ingestion/src/datahub/utilities/openapi_utils.py b/metadata-ingestion/src/datahub/utilities/openapi_utils.py new file mode 100644 index 0000000000000..e704ff7f84cbb --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/openapi_utils.py @@ -0,0 +1,69 @@ +import logging +from typing import Iterable, Union + +import datahub.metadata.schema_classes as models +from datahub.ingestion.graph.client import DataHubGraph +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, +) + +logger = logging.getLogger(__name__) + + +class OpenAPIGraphClient: + """ + An experimental client for the 
DataHubGraph that uses the OpenAPI endpoints
+    to query entities and aspects.
+    Does not support all features of the DataHubGraph.
+    API is subject to change.
+
+    DO NOT USE THIS UNLESS YOU KNOW WHAT YOU ARE DOING.
+    """
+
+    ENTITY_KEY_ASPECT_MAP = {
+        aspect_type.ASPECT_INFO.get("keyForEntity"): name
+        for name, aspect_type in models.ASPECT_NAME_MAP.items()
+        if aspect_type.ASPECT_INFO.get("keyForEntity")
+    }
+
+    def __init__(self, graph: DataHubGraph):
+        self.graph = graph
+        self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3"
+
+    def scroll_urns_by_filter(
+        self,
+        entity_type: str,
+        query: Union[ElasticDocumentQuery, ElasticsearchQueryBuilder],
+    ) -> Iterable[str]:
+        """
+        Scroll through all urns that match the given filters.
+
+        """
+
+        key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type)
+        assert key_aspect, f"No key aspect found for entity type {entity_type}"
+
+        count = 1000
+        string_query = query.build()
+        scroll_id = None
+        logger.debug(f"Scrolling with query: {string_query}")
+        while True:
+            response = self.graph._get_generic(
+                self.openapi_base + f"/entity/{entity_type.lower()}",
+                params={
+                    "systemMetadata": "false",
+                    "includeSoftDelete": "false",
+                    "skipCache": "false",
+                    "aspects": [key_aspect],
+                    "scrollId": scroll_id,
+                    "count": count,
+                    "query": string_query,
+                },
+            )
+            entities = response.get("entities", [])
+            scroll_id = response.get("scrollId")
+            for entity in entities:
+                yield entity["urn"]
+            if not scroll_id:
+                break
diff --git a/metadata-ingestion/src/datahub/utilities/search_utils.py b/metadata-ingestion/src/datahub/utilities/search_utils.py
new file mode 100644
index 0000000000000..0bd88addd8660
--- /dev/null
+++ b/metadata-ingestion/src/datahub/utilities/search_utils.py
@@ -0,0 +1,285 @@
+import logging
+import re
+from enum import Enum
+from typing import Generic, List, Optional, Tuple, Type, TypeVar, Union
+
+logger = logging.getLogger(__name__)
+
+
+class LogicalOperator(Enum):
+    AND = "AND"
+    OR = "OR"
+
+
+class SearchField:
+    def __init__(self, field_name: str):
+        self.field_name = field_name
+
+    def get_search_value(self, value: str) -> str:
+        return value
+
+    def __str__(self) -> str:
+        return self.field_name
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+    @classmethod
+    def from_string_field(cls, field_name: str) -> "SearchField":
+        return cls(field_name)
+
+
+class QueryNode:
+    def __init__(self, operator: Optional[LogicalOperator] = None):
+        self.operator = operator
+        self.children: List[Union[QueryNode, str]] = []
+
+    def add_child(self, child: Union["QueryNode", str]) -> None:
+        self.children.append(child)
+
+    def build(self) -> str:
+        if not self.children:
+            return ""
+
+        if self.operator is None:
+            return (
+                self.children[0]
+                if isinstance(self.children[0], str)
+                else self.children[0].build()
+            )
+
+        child_queries = []
+        for child in self.children:
+            if isinstance(child, str):
+                child_queries.append(child)
+            else:
+                child_queries.append(child.build())
+
+        joined_queries = f" {self.operator.value} ".join(child_queries)
+        return f"({joined_queries})" if len(child_queries) > 1 else joined_queries
+
+
+class ElasticsearchQueryBuilder:
+    SPECIAL_CHARACTERS = r'+-=&|><!(){}[]^"~*?:\\/'
+
+    def __init__(self, operator: LogicalOperator = LogicalOperator.AND) -> None:
+        self.root = QueryNode(operator=operator)
+
+    @classmethod
+    def escape_special_characters(cls, value: str) -> str:
+        """
+        Escape special characters in the search term.
+ """ + return re.sub(f"([{re.escape(cls.SPECIAL_CHARACTERS)}])", r"\\\1", value) + + def _create_term( + self, field: SearchField, value: str, is_exact: bool = False + ) -> str: + escaped_value = self.escape_special_characters(field.get_search_value(value)) + field_name: str = field.field_name + if is_exact: + return f'{field_name}:"{escaped_value}"' + return f"{field_name}:{escaped_value}" + + def add_field_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = self._create_term(field, value, is_exact) + self.root.add_child(term) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = f"-{self._create_term(field, value, is_exact)}" + self.root.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticsearchQueryBuilder": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.root.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "ElasticsearchQueryBuilder": + wildcard_query = f"{field}:{pattern}" + self.root.add_child(wildcard_query) + return self + + def add_fuzzy( + self, field: str, value: str, fuzziness: int = 2 + ) -> "ElasticsearchQueryBuilder": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.root.add_child(fuzzy_query) + return self + + def add_boost( + self, field: str, value: str, boost: float + ) -> "ElasticsearchQueryBuilder": + boosted_query = f"{field}:{value}^{boost}" + self.root.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + return QueryGroup(self, operator) + + def build(self) -> str: + return self.root.build() + + +class QueryGroup: + def __init__(self, parent: ElasticsearchQueryBuilder, operator: LogicalOperator): + self.parent = parent + self.node = QueryNode(operator) + self.parent.root.add_child(self.node) + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = self.parent._create_term(field, value, is_exact) + self.node.add_child(term) + return self + + def add_field_not_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = f"-{self.parent._create_term(field, value, is_exact)}" + self.node.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "QueryGroup": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.node.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "QueryGroup": + wildcard_query = f"{field}:{pattern}" + 
self.node.add_child(wildcard_query) + return self + + def add_fuzzy(self, field: str, value: str, fuzziness: int = 2) -> "QueryGroup": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.node.add_child(fuzzy_query) + return self + + def add_boost(self, field: str, value: str, boost: float) -> "QueryGroup": + boosted_query = f"{field}:{value}^{boost}" + self.node.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + new_group = QueryGroup(self.parent, operator) + self.node.add_child(new_group.node) + return new_group + + def end(self) -> ElasticsearchQueryBuilder: + return self.parent + + +SF = TypeVar("SF", bound=SearchField) + + +class ElasticDocumentQuery(Generic[SF]): + def __init__(self) -> None: + self.query_builder = ElasticsearchQueryBuilder() + + @classmethod + def create_from( + cls: Type["ElasticDocumentQuery[SF]"], + *args: Tuple[Union[str, SF], str], + ) -> "ElasticDocumentQuery[SF]": + instance = cls() + for arg in args: + if isinstance(arg, SearchField): + # If the value is empty, we treat it as a wildcard search + logger.info(f"Adding wildcard search for field {arg}") + instance.add_wildcard(arg, "*") + elif isinstance(arg, tuple) and len(arg) == 2: + field, value = arg + assert isinstance(value, str) + if isinstance(field, SearchField): + instance.add_field_match(field, value) + elif isinstance(field, str): + instance.add_field_match( + SearchField.from_string_field(field), value + ) + else: + raise ValueError("Invalid field type {}".format(type(field))) + return instance + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + if isinstance(field, str): + field = SearchField.from_string_field(field) + self.query_builder.add_field_match(field, value, is_exact) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + self.query_builder.add_field_not_match(field, value, is_exact) + return self + + def add_range( + self, + field: SearchField, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_range( + field_name, min_value, max_value, include_min, include_max + ) + return self + + def add_wildcard(self, field: SearchField, pattern: str) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_wildcard(field_name, pattern) + return self + + def add_fuzzy( + self, field: SearchField, value: str, fuzziness: int = 2 + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_fuzzy(field_name, value, fuzziness) + return self + + def add_boost( + self, field: SearchField, value: str, boost: float + ) -> "ElasticDocumentQuery": + self.query_builder.add_boost(field.field_name, value, boost) + return self + + def group(self, operator: LogicalOperator) -> QueryGroup: + return self.query_builder.group(operator) + + def build(self) -> str: + return self.query_builder.build() diff --git a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py index e6c9a9466d62b..a84e373dbe72c 100644 --- a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py +++ 
b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py @@ -4,9 +4,12 @@ import datahub.metadata.schema_classes as models from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) +from datahub.utilities.search_utils import LogicalOperator def test_platform_resource_dict(): @@ -179,3 +182,15 @@ class TestModel(BaseModel): ).encode("utf-8") assert platform_resource_info_mcp.aspect.value.schemaType == "JSON" assert platform_resource_info_mcp.aspect.value.schemaRef == TestModel.__name__ + + +def test_platform_resource_filters(): + + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PRIMARY_KEY, "test_1") + .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, "server") + .end() + ) + assert query.build() == '(primaryKey:"test_1" AND resourceType:"server")' diff --git a/metadata-ingestion/tests/unit/utilities/test_search_utils.py b/metadata-ingestion/tests/unit/utilities/test_search_utils.py new file mode 100644 index 0000000000000..6fa2e46c7f20e --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_search_utils.py @@ -0,0 +1,71 @@ +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + LogicalOperator, + SearchField, +) + + +def test_simple_and_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND field2:"value2")' + + +def test_simple_or_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" OR field2:"value2")' + + # Use SearchFilter to create this query + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match(SearchField.from_string_field("field1"), "value1") + .add_field_match(SearchField.from_string_field("field2"), "value2") + .end() + ) + assert query.build() == '(field1:"value1" OR field2:"value2")' + + +def test_simple_field_match(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1:1") + ) + assert query.build() == 'field1:"value1\\:1"' + + # Another way to create the same query + query = ElasticDocumentQuery.create_from() + query.add_field_match("field1", "value1:1") + assert query.build() == 'field1:"value1\\:1"' + + +def test_negation(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_not_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND -field2:"value2")' + + +def test_multi_arg_create_from(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1"), + ("field2", "value2"), + ) + assert query.build() == '(field1:"value1" AND field2:"value2")' diff --git a/smoke-test/tests/platform_resources/test_platform_resource.py b/smoke-test/tests/platform_resources/test_platform_resource.py index 7ebfd4d6ea15b..39d15f2e8dea6 100644 --- a/smoke-test/tests/platform_resources/test_platform_resource.py +++ b/smoke-test/tests/platform_resources/test_platform_resource.py @@ -5,8 +5,10 @@ import pytest from datahub.api.entities.platformresource.platform_resource import 
( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) from tests.utils import wait_for_healthcheck_util, wait_for_writes_to_sync @@ -42,7 +44,12 @@ def cleanup_resources(graph_client): logger.warning(f"Failed to delete resource: {e}") # Additional cleanup for any resources that might have been missed - for resource in PlatformResource.search_by_key(graph_client, "test_"): + for resource in PlatformResource.search_by_filters( + graph_client, + ElasticPlatformResourceQuery.create_from().add_wildcard( + PlatformResourceSearchFields.PRIMARY_KEY, "test_*" + ), + ): try: resource.delete(graph_client) except Exception as e: @@ -114,7 +121,7 @@ def test_platform_resource_non_existent(graph_client, test_id): assert platform_resource is None -def test_platform_resource_urn_secondary_key(graph_client, test_id): +def test_platform_resource_urn_secondary_key(graph_client, test_id, cleanup_resources): key = PlatformResourceKey( platform=f"test_platform_{test_id}", resource_type=f"test_resource_type_{test_id}", @@ -129,6 +136,7 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): secondary_keys=[dataset_urn], ) platform_resource.to_datahub(graph_client) + cleanup_resources.append(platform_resource) wait_for_writes_to_sync() read_platform_resources = [ @@ -141,7 +149,9 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): assert read_platform_resources[0] == platform_resource -def test_platform_resource_listing_by_resource_type(graph_client, test_id): +def test_platform_resource_listing_by_resource_type( + graph_client, test_id, cleanup_resources +): # Generate two resources with the same resource type key1 = PlatformResourceKey( platform=f"test_platform_{test_id}", @@ -171,13 +181,9 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): r for r in PlatformResource.search_by_filters( graph_client, - and_filters=[ - { - "field": "resourceType", - "condition": "EQUAL", - "value": key1.resource_type, - } - ], + query=ElasticPlatformResourceQuery.create_from( + (PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + ), ) ] assert len(search_results) == 2 @@ -186,3 +192,55 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): read_platform_resource_2 = next(r for r in search_results if r.id == key2.id) assert read_platform_resource_1 == platform_resource1 assert read_platform_resource_2 == platform_resource2 + + +def test_platform_resource_listing_complex_queries(graph_client, test_id): + # Generate two resources with the same resource type + key1 = PlatformResourceKey( + platform=f"test_platform1_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_1_{test_id}", + ) + platform_resource1 = PlatformResource.create( + key=key1, + value={"test_key": f"test_value_1_{test_id}"}, + ) + platform_resource1.to_datahub(graph_client) + + key2 = PlatformResourceKey( + platform=f"test_platform2_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_2_{test_id}", + ) + platform_resource2 = PlatformResource.create( + key=key2, + value={"test_key": f"test_value_2_{test_id}"}, + ) + platform_resource2.to_datahub(graph_client) + + wait_for_writes_to_sync() + from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + LogicalOperator, + PlatformResourceSearchFields, + ) + + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + 
.add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + .add_field_not_match(PlatformResourceSearchFields.PLATFORM, key1.platform) + .end() + ) + + search_results = [ + r + for r in PlatformResource.search_by_filters( + graph_client, + query=query, + ) + ] + assert len(search_results) == 1 + + read_platform_resource = search_results[0] + assert read_platform_resource == platform_resource2 From 2f85bc1ecbd025d9ce0c6d3da644b3f83656755c Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Sun, 20 Oct 2024 06:58:32 +0530 Subject: [PATCH 15/17] fix(docs): fix businessattributes doc (#11653) --- docs/businessattributes.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/businessattributes.md b/docs/businessattributes.md index 1744f48f879e8..3e912e7e60980 100644 --- a/docs/businessattributes.md +++ b/docs/businessattributes.md @@ -28,7 +28,6 @@ Taking the example of "United States- Social Security Number", if an application What you need to create/update and associate business attributes to dataset schema field * **Manage Business Attributes** platform privilege to create/update/delete business attributes. -* **Edit Dataset Column Business Attribute** metadata privilege to associate business attributes to dataset schema field. ## Using Business Attributes As of now Business Attributes can only be created through UI From dd1a06fb558a2177ea02437ea0aa7172c2ccd781 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Sun, 20 Oct 2024 23:59:45 -0700 Subject: [PATCH 16/17] feat(ingest/fivetran): add safeguards on table/column lineage (#11674) --- .../ingestion/source/fivetran/config.py | 19 +-- .../ingestion/source/fivetran/data_classes.py | 2 +- .../ingestion/source/fivetran/fivetran.py | 23 ++- .../source/fivetran/fivetran_log_api.py | 86 +++++------ .../source/fivetran/fivetran_query.py | 143 +++++++++++------- .../integration/fivetran/test_fivetran.py | 6 +- 6 files changed, 156 insertions(+), 123 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index 02eb096b240f5..2fb5ffd16ea34 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -1,6 +1,6 @@ +import dataclasses import logging -from dataclasses import dataclass, field as dataclass_field -from typing import Dict, List, Optional +from typing import Dict, Optional import pydantic from pydantic import Field, root_validator @@ -23,6 +23,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfigBase, ) +from datahub.utilities.lossy_collections import LossyList from datahub.utilities.perf_timer import PerfTimer logger = logging.getLogger(__name__) @@ -114,24 +115,24 @@ def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict: return values -@dataclass +@dataclasses.dataclass class MetadataExtractionPerfReport(Report): - connectors_metadata_extraction_sec: PerfTimer = dataclass_field( + connectors_metadata_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_lineage_extraction_sec: PerfTimer = dataclass_field( + connectors_lineage_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_jobs_extraction_sec: PerfTimer = dataclass_field( + connectors_jobs_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) -@dataclass 
+@dataclasses.dataclass class FivetranSourceReport(StaleEntityRemovalSourceReport): connectors_scanned: int = 0 - filtered_connectors: List[str] = dataclass_field(default_factory=list) - metadata_extraction_perf: MetadataExtractionPerfReport = dataclass_field( + filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList) + metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field( default_factory=MetadataExtractionPerfReport ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py index 18de2b01edd3b..046aa9efe3f59 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py @@ -24,7 +24,7 @@ class Connector: sync_frequency: int destination_id: str user_id: str - table_lineage: List[TableLineage] + lineage: List[TableLineage] jobs: List["Job"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 334bb58ea84f8..c27ec57c2e99e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,9 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import ( +from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_query import ( MAX_JOBS_PER_CONNECTOR, - FivetranLogAPI, + MAX_TABLE_LINEAGE_PER_CONNECTOR, ) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -106,13 +107,21 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity." ) - for table_lineage in connector.table_lineage: + if len(connector.lineage) >= MAX_TABLE_LINEAGE_PER_CONNECTOR: + self.report.warning( + title="Table lineage truncated", + message=f"The connector had more than {MAX_TABLE_LINEAGE_PER_CONNECTOR} table lineage entries. 
" + f"Only the most recent {MAX_TABLE_LINEAGE_PER_CONNECTOR} entries were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) + + for lineage in connector.lineage: input_dataset_urn = DatasetUrn.create_from_ids( platform_id=source_platform, table_name=( - f"{source_database.lower()}.{table_lineage.source_table}" + f"{source_database.lower()}.{lineage.source_table}" if source_database - else table_lineage.source_table + else lineage.source_table ), env=source_platform_detail.env, platform_instance=source_platform_detail.platform_instance, @@ -121,14 +130,14 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: output_dataset_urn = DatasetUrn.create_from_ids( platform_id=self.config.fivetran_log_config.destination_platform, - table_name=f"{self.audit_log.fivetran_log_database.lower()}.{table_lineage.destination_table}", + table_name=f"{self.audit_log.fivetran_log_database.lower()}.{lineage.destination_table}", env=destination_platform_detail.env, platform_instance=destination_platform_detail.platform_instance, ) output_dataset_urn_list.append(output_dataset_urn) if self.config.include_column_lineage: - for column_lineage in table_lineage.column_lineage: + for column_lineage in lineage.column_lineage: fine_grained_lineage.append( FineGrainedLineage( upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 5908efe39e2b4..b55c8bbbd607f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -1,6 +1,7 @@ import functools import json import logging +from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple import sqlglot @@ -22,10 +23,6 @@ logger: logging.Logger = logging.getLogger(__name__) -# We don't want to generate a massive number of dataProcesses for a single connector. -# This is primarily used as a safeguard to prevent performance issues. 
-MAX_JOBS_PER_CONNECTOR = 1000 - class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -91,55 +88,51 @@ def _query(self, query: str) -> List[Dict]: resp = self.engine.execute(query) return [row for row in resp] - def _get_column_lineage_metadata(self) -> Dict[str, List]: + def _get_column_lineage_metadata(self) -> Dict[Tuple[str, str], List]: """ - Return's dict of column lineage metadata with key as '-' + Returns dict of column lineage metadata with key as (, ) """ - all_column_lineage: Dict[str, List] = {} + all_column_lineage = defaultdict(list) column_lineage_result = self._query( self.fivetran_log_query.get_column_lineage_query() ) for column_lineage in column_lineage_result: - key = f"{column_lineage[Constant.SOURCE_TABLE_ID]}-{column_lineage[Constant.DESTINATION_TABLE_ID]}" - if key not in all_column_lineage: - all_column_lineage[key] = [column_lineage] - else: - all_column_lineage[key].append(column_lineage) - return all_column_lineage + key = ( + column_lineage[Constant.SOURCE_TABLE_ID], + column_lineage[Constant.DESTINATION_TABLE_ID], + ) + all_column_lineage[key].append(column_lineage) + return dict(all_column_lineage) - def _get_connectors_table_lineage_metadata(self) -> Dict[str, List]: + def _get_table_lineage_metadata(self) -> Dict[str, List]: """ - Return's dict of table lineage metadata with key as 'CONNECTOR_ID' + Returns dict of table lineage metadata with key as 'CONNECTOR_ID' """ - connectors_table_lineage_metadata: Dict[str, List] = {} + connectors_table_lineage_metadata = defaultdict(list) table_lineage_result = self._query( self.fivetran_log_query.get_table_lineage_query() ) for table_lineage in table_lineage_result: - if ( + connectors_table_lineage_metadata[ table_lineage[Constant.CONNECTOR_ID] - not in connectors_table_lineage_metadata - ): - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ] = [table_lineage] - else: - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ].append(table_lineage) - return connectors_table_lineage_metadata + ].append(table_lineage) + return dict(connectors_table_lineage_metadata) - def _get_table_lineage( + def _extract_connector_lineage( self, - column_lineage_metadata: Dict[str, List], table_lineage_result: Optional[List], + column_lineage_metadata: Dict[Tuple[str, str], List], ) -> List[TableLineage]: table_lineage_list: List[TableLineage] = [] if table_lineage_result is None: return table_lineage_list for table_lineage in table_lineage_result: + # Join the column lineage into the table lineage. 
column_lineage_result = column_lineage_metadata.get( - f"{table_lineage[Constant.SOURCE_TABLE_ID]}-{table_lineage[Constant.DESTINATION_TABLE_ID]}" + ( + table_lineage[Constant.SOURCE_TABLE_ID], + table_lineage[Constant.DESTINATION_TABLE_ID], + ) ) column_lineage_list: List[ColumnLineage] = [] if column_lineage_result: @@ -152,6 +145,7 @@ def _get_table_lineage( ) for column_lineage in column_lineage_result ] + table_lineage_list.append( TableLineage( source_table=f"{table_lineage[Constant.SOURCE_SCHEMA_NAME]}.{table_lineage[Constant.SOURCE_TABLE_NAME]}", @@ -167,14 +161,9 @@ def _get_all_connector_sync_logs( ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} - # Format connector_ids as a comma-separated string of quoted IDs - formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) - - query = self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, + query = self.fivetran_log_query.get_sync_logs_query( syncs_interval=syncs_interval, - max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, - connector_ids=formatted_connector_ids, + connector_ids=connector_ids, ) for row in self._query(query): @@ -234,13 +223,13 @@ def get_user_email(self, user_id: str) -> Optional[str]: return None return self._get_users().get(user_id) - def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: - table_lineage_metadata = self._get_connectors_table_lineage_metadata() + def _fill_connectors_lineage(self, connectors: List[Connector]) -> None: + table_lineage_metadata = self._get_table_lineage_metadata() column_lineage_metadata = self._get_column_lineage_metadata() for connector in connectors: - connector.table_lineage = self._get_table_lineage( - column_lineage_metadata=column_lineage_metadata, + connector.lineage = self._extract_connector_lineage( table_lineage_result=table_lineage_metadata.get(connector.connector_id), + column_lineage_metadata=column_lineage_metadata, ) def _fill_connectors_jobs( @@ -262,6 +251,7 @@ def get_allowed_connectors_list( ) -> List[Connector]: connectors: List[Connector] = [] with report.metadata_extraction_perf.connectors_metadata_extraction_sec: + logger.info("Fetching connector list") connector_list = self._query(self.fivetran_log_query.get_connectors_query()) for connector in connector_list: if not connector_patterns.allowed(connector[Constant.CONNECTOR_NAME]): @@ -279,12 +269,20 @@ def get_allowed_connectors_list( sync_frequency=connector[Constant.SYNC_FREQUENCY], destination_id=connector[Constant.DESTINATION_ID], user_id=connector[Constant.CONNECTING_USER_ID], - table_lineage=[], - jobs=[], + lineage=[], # filled later + jobs=[], # filled later ) ) + + if not connectors: + # Some of our queries don't work well when there's no connectors, since + # we push down connector id filters. 
+ return [] + with report.metadata_extraction_perf.connectors_lineage_extraction_sec: - self._fill_connectors_table_lineage(connectors) + logger.info("Fetching connector lineage") + self._fill_connectors_lineage(connectors) with report.metadata_extraction_perf.connectors_jobs_extraction_sec: + logger.info("Fetching connector job run history") self._fill_connectors_jobs(connectors, syncs_interval) return connectors diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index c4680b4b1037a..c9e329b706768 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -1,3 +1,11 @@ +from typing import List + +# Safeguards to prevent fetching massive amounts of data. +MAX_TABLE_LINEAGE_PER_CONNECTOR = 100 +MAX_COLUMN_LINEAGE_PER_CONNECTOR = 3000 +MAX_JOBS_PER_CONNECTOR = 1000 + + class FivetranLogQuery: # Note: All queries are written in Snowflake SQL. # They will be transpiled to the target database's SQL dialect at runtime. @@ -24,69 +32,88 @@ def get_connectors_query(self) -> str: destination_id FROM {self.db_clause}connector WHERE - _fivetran_deleted = FALSE\ + _fivetran_deleted = FALSE """ def get_users_query(self) -> str: - return f""" - SELECT id as user_id, - given_name, - family_name, - email - FROM {self.db_clause}user""" + return f"""\ +SELECT id as user_id, +given_name, +family_name, +email +FROM {self.db_clause}user +""" - def get_sync_logs_query(self) -> str: - return """ - WITH ranked_syncs AS ( - SELECT - connector_id, - sync_id, - MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, - MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, - MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, - ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' - AND connector_id IN ({connector_ids}) - GROUP BY connector_id, sync_id - ) - SELECT - connector_id, - sync_id, - start_time, - end_time, - end_message_data - FROM ranked_syncs - WHERE rn <= {max_jobs_per_connector} - AND start_time IS NOT NULL - AND end_time IS NOT NULL - ORDER BY connector_id, end_time DESC - """ + def get_sync_logs_query( + self, + syncs_interval: int, + connector_ids: List[str], + ) -> str: + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + return f"""\ +WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {self.db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({formatted_connector_ids}) + GROUP BY connector_id, sync_id +) +SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data +FROM ranked_syncs +WHERE rn <= {MAX_JOBS_PER_CONNECTOR} + AND start_time IS NOT NULL + AND end_time IS NOT NULL 
+ORDER BY connector_id, end_time DESC +""" def get_table_lineage_query(self) -> str: - return f""" - SELECT stm.connector_id as connector_id, - stm.id as source_table_id, - stm.name as source_table_name, - ssm.name as source_schema_name, - dtm.id as destination_table_id, - dtm.name as destination_table_name, - dsm.name as destination_schema_name - FROM {self.db_clause}table_lineage as tl - JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id - JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id - JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id - JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id""" + return f"""\ +SELECT + stm.connector_id as connector_id, + stm.id as source_table_id, + stm.name as source_table_name, + ssm.name as source_schema_name, + dtm.id as destination_table_id, + dtm.name as destination_table_name, + dsm.name as destination_schema_name +FROM {self.db_clause}table_lineage as tl +JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id +JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id +JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id +JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY tl.created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, tl.created_at DESC +""" def get_column_lineage_query(self) -> str: - return f""" - SELECT scm.table_id as source_table_id, - dcm.table_id as destination_table_id, - scm.name as source_column_name, - dcm.name as destination_column_name - FROM {self.db_clause}column_lineage as cl - JOIN {self.db_clause}source_column_metadata as scm - on cl.source_column_id = scm.id - JOIN {self.db_clause}destination_column_metadata as dcm - on cl.destination_column_id = dcm.id""" + return f"""\ +SELECT + scm.table_id as source_table_id, + dcm.table_id as destination_table_id, + scm.name as source_column_name, + dcm.name as destination_column_name +FROM {self.db_clause}column_lineage as cl +JOIN {self.db_clause}source_column_metadata as scm + ON cl.source_column_id = scm.id +JOIN {self.db_clause}destination_column_metadata as dcm + ON cl.destination_column_id = dcm.id +-- Only joining source_table_metadata to get the connector_id. 
+JOIN {self.db_clause}source_table_metadata as stm + ON scm.table_id = stm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY cl.created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, cl.created_at DESC +""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 33ac09e69a3c0..e72162b12e48f 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -100,11 +100,9 @@ def default_query_results( "email": "abc.xyz@email.com", } ] - elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, + elif query == fivetran_log_query.get_sync_logs_query( syncs_interval=7, - max_jobs_per_connector=1000, - connector_ids="'calendar_elected'", + connector_ids=["calendar_elected"], ): return [ { From 554288bb05354dae16471381daec3549658bdda3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:00:09 +0200 Subject: [PATCH 17/17] fix(ui): show DataHub logo for DataHub sources in ingestion souces list (#11658) Co-authored-by: Shirshanka Das --- .../src/app/ingest/source/builder/constants.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index b67ca388c1054..0e0ba8b22e37e 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -35,6 +35,7 @@ import csvLogo from '../../../../images/csv-logo.png'; import qlikLogo from '../../../../images/qliklogo.png'; import sigmaLogo from '../../../../images/sigmalogo.png'; import sacLogo from '../../../../images/saclogo.svg'; +import datahubLogo from '../../../../images/datahublogo.png'; export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -125,6 +126,11 @@ export const SIGMA = 'sigma'; export const SIGMA_URN = `urn:li:dataPlatform:${SIGMA}`; export const SAC = 'sac'; export const SAC_URN = `urn:li:dataPlatform:${SAC}`; +export const DATAHUB = 'datahub'; +export const DATAHUB_GC = 'datahub-gc'; +export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file'; +export const DATAHUB_BUSINESS_GLOSSARY = 'datahub-business-glossary'; +export const DATAHUB_URN = `urn:li:dataPlatform:${DATAHUB}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, @@ -165,6 +171,7 @@ export const PLATFORM_URN_TO_LOGO = { [QLIK_SENSE_URN]: qlikLogo, [SIGMA_URN]: sigmaLogo, [SAC_URN]: sacLogo, + [DATAHUB_URN]: datahubLogo, }; export const SOURCE_TO_PLATFORM_URN = { @@ -178,5 +185,7 @@ export const SOURCE_TO_PLATFORM_URN = { [SNOWFLAKE_USAGE]: SNOWFLAKE_URN, [STARBURST_TRINO_USAGE]: TRINO_URN, [DBT_CLOUD]: DBT_URN, - [VERTICA]: VERTICA_URN, + [DATAHUB_GC]: DATAHUB_URN, + [DATAHUB_LINEAGE_FILE]: DATAHUB_URN, + [DATAHUB_BUSINESS_GLOSSARY]: DATAHUB_URN, };