From cf3634a6a9233b98d45560beb8cbabf6f17ff166 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Thu, 17 Oct 2024 19:57:02 +0200 Subject: [PATCH 01/17] fix(ingest/bigquery): Not setting platform instance for bigquery platform resources (#11659) --- .../ingestion/source/bigquery_v2/bigquery.py | 3 - .../bigquery_platform_resource_helper.py | 2 +- .../bigquery_v2/bigquery_mcp_golden.json | 90 +++++++++---------- 3 files changed, 45 insertions(+), 50 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index c30dade921d25..3c6202cc7cbfa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -65,9 +65,6 @@ # We can't use close as it is not called if the ingestion is not successful def cleanup(config: BigQueryV2Config) -> None: if config._credentials_path is not None: - logger.debug( - f"Deleting temporary credential file at {config._credentials_path}" - ) os.unlink(config._credentials_path) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py index d2da895be985d..9da2aceb19220 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_platform_resource_helper.py @@ -42,7 +42,7 @@ def platform_resource_key(self) -> PlatformResourceKey: return PlatformResourceKey( platform="bigquery", resource_type="BigQueryLabelInfo", - platform_instance=self.project, + platform_instance=None, primary_key=self.label.primary_key(), ) diff --git a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json index b268926f155b7..640ee1bf436b0 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json +++ b/metadata-ingestion/tests/integration/bigquery_v2/bigquery_mcp_golden.json @@ -201,7 +201,7 @@ }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "platformResourceInfo", "aspect": { @@ -221,30 +221,29 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", + "entityUrn": "urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:bigquery", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + "platform": "urn:li:dataPlatform:bigquery" } }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "entityType": "platformResource", + "entityUrn": 
"urn:li:platformResource:11f75a60f0dbd414676852e46a45e39b", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -254,13 +253,13 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", "changeType": "UPSERT", "aspectName": "platformResourceInfo", "aspect": { @@ -280,19 +279,50 @@ }, "systemMetadata": { "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", "lastRunId": "no-run-id-provided" } }, { "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", "changeType": "UPSERT", "aspectName": "dataPlatformInstance", "aspect": { "json": { - "platform": "urn:li:dataPlatform:bigquery", - "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:bigquery,project-id-1)" + "platform": "urn:li:dataPlatform:bigquery" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "platformResource", + "entityUrn": "urn:li:platformResource:99b34051bd90d28d922b0e107277a916", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "bigquery-2022_02_03-07_00_00-2j2qqv", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false } }, "systemMetadata": { @@ -395,38 +425,6 @@ "lastRunId": "no-run-id-provided" } }, -{ - "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:79d443a7956814fdab2168e11392bbf2", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, -{ - "entityType": "platformResource", - "entityUrn": "urn:li:platformResource:0a8c87e84bd90486c4fd57bbae6557e3", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1643871600000, - "runId": "bigquery-2022_02_03-07_00_00", - "lastRunId": "no-run-id-provided" - } -}, { "entityType": "dataset", "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:bigquery,project-id-1.bigquery-dataset-1.table-1,PROD)", From 6f193221c957b4adcc0f15b1e0af0679c1665272 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 11:50:42 -0700 Subject: [PATCH 02/17] fix(ingest/dbt): fix bug in CLL pruning (#11614) --- .../src/datahub/ingestion/source/dbt/dbt_common.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index c15f1deb43a3a..4cd3c934ce634 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1074,8 +1074,8 @@ def add_node_to_cll_list(dbt_name: str) -> None: for upstream in all_nodes_map[dbt_name].upstream_nodes: schema_nodes.add(upstream) - upstream_node = all_nodes_map[upstream] - if upstream_node.is_ephemeral_model(): + upstream_node = all_nodes_map.get(upstream) + if upstream_node and upstream_node.is_ephemeral_model(): add_node_to_cll_list(upstream) cll_nodes.add(dbt_name) From 523a456f5c9f166c5e99a9cdb33d9b421e199505 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 12:33:04 -0700 Subject: [PATCH 03/17] fix(ingest/redshift): fix syntax error in temp sql (#11661) --- .../src/datahub/ingestion/source/redshift/exception.py | 8 ++++---- .../src/datahub/ingestion/source/redshift/query.py | 5 ++--- .../tests/unit/redshift/redshift_query_mocker.py | 2 +- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py index 43ad5bfcefdf1..ed0856fc1e292 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py @@ -40,25 +40,25 @@ def report_redshift_failure( error_message = str(e).lower() if "permission denied" in error_message: if "svv_table_info" in error_message: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permission to access 'svv_table_info' table. Please ensure the provided database user has access.", exc=e, ) elif "svl_user_info" in error_message: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permission to access 'svl_user_info' table. 
Please ensure the provided database user has access.", exc=e, ) else: - report.report_failure( + report.failure( title="Permission denied", message="Failed to extract metadata due to insufficient permissions.", exc=e, ) else: - report.report_failure( + report.failure( title="Failed to extract some metadata", message="Failed to extract some metadata from Redshift.", exc=e, diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index f7fad574f7fbe..b18b526ef30fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -569,8 +569,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: end_time_str: str = end_time.strftime(redshift_datetime_format) - return rf"""\ --- DataHub Redshift Source temp table DDL query + return rf"""-- DataHub Redshift Source temp table DDL query select * from ( @@ -645,7 +644,7 @@ def temp_table_ddl_query(start_time: datetime, end_time: datetime) -> str: ) where rn = 1 - """ +""" # Add this join to the sql query for more metrics on completed queries # LEFT JOIN svl_query_metrics_summary sqms ON ss.query = sqms.query diff --git a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py index 06b592d42914b..10c0250e37e37 100644 --- a/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py +++ b/metadata-ingestion/tests/unit/redshift/redshift_query_mocker.py @@ -56,7 +56,7 @@ def mock_stl_insert_table_cursor(cursor: MagicMock) -> None: query_vs_cursor_mocker = { ( - "\\\n-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n " + "-- DataHub Redshift Source temp table DDL query\nselect\n *\nfrom (\n select\n session_id,\n transaction_id,\n start_time,\n userid,\n REGEXP_REPLACE(REGEXP_SUBSTR(REGEXP_REPLACE(query_text,'\\\\\\\\n','\\\\n'), '(CREATE(?:[\\\\n\\\\s\\\\t]+(?:temp|temporary))?(?:[\\\\n\\\\s\\\\t]+)table(?:[\\\\n\\\\s\\\\t]+)[^\\\\n\\\\s\\\\t()-]+)', 0, 1, 'ipe'),'[\\\\n\\\\s\\\\t]+',' ',1,'p') as create_command,\n query_text,\n row_number() over (\n partition by session_id, TRIM(query_text)\n order by start_time desc\n ) rn\n from (\n select\n pid as session_id,\n xid as transaction_id,\n starttime as start_time,\n type,\n query_text,\n userid\n from (\n select\n starttime,\n pid,\n xid,\n type,\n userid,\n LISTAGG(case\n when LEN(RTRIM(text)) = 0 then text\n else RTRIM(text)\n end,\n '') within group (\n order by sequence\n ) as query_text\n from\n SVL_STATEMENTTEXT\n where\n type in ('DDL', 'QUERY')\n AND starttime >= '2024-01-01 12:00:00'\n AND starttime < '2024-01-10 12:00:00'\n AND sequence < 290\n group by\n starttime,\n pid,\n xid,\n type,\n userid\n order by\n starttime,\n pid,\n xid,\n type,\n userid\n asc\n )\n where\n type in ('DDL', 'QUERY')\n )\n where\n (create_command ilike 'create temp table %'\n or create_command ilike 'create temporary table %'\n -- we want to get all the create table statements and not just temp tables if non temp table is created and dropped in the same transaction\n or create_command ilike 'create table %')\n -- Redshift creates temp tables with the following names: volt_tt_%. 
We need to filter them out.\n and query_text not ilike 'CREATE TEMP TABLE volt_tt_%'\n and create_command not like 'CREATE TEMP TABLE volt_tt_'\n -- We need to filter out our query and it was not possible earlier when we did not have any comment in the query\n and query_text not ilike '%https://stackoverflow.com/questions/72770890/redshift-result-size-exceeds-listagg-limit-on-svl-statementtext%'\n\n)\nwhere\n rn = 1\n" ): mock_temp_table_cursor, "select * from test_collapse_temp_lineage": mock_stl_insert_table_cursor, } From d7ccf3b2c34fa53085678ab635c733edf8d9fc31 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 16:36:18 -0700 Subject: [PATCH 04/17] docs(airflow): add known limitations for automatic lineage (#11652) --- docs/lineage/airflow.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 35f2ff862e695..829c048a8f8e2 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -132,7 +132,7 @@ conn_id = datahub_rest_default # or datahub_kafka_default ``` | Name | Default value | Description | -|----------------------------|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| -------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | enabled | true | If the plugin should be enabled. | | conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. | | cluster | prod | name of the airflow cluster | @@ -191,6 +191,10 @@ These operators are supported by OpenLineage, but we haven't tested them yet: There's also a few operators (e.g. BashOperator, PythonOperator) that have custom extractors, but those extractors don't generate lineage. --> +Known limitations: + +- We do not fully support operators that run multiple SQL statements at once. In these cases, we'll only capture lineage from the first SQL statement. + ## Manual Lineage Annotation ### Using `inlets` and `outlets` From 6e3724b2daaeeebd1895fc43fda4956827112af3 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 17 Oct 2024 17:50:59 -0700 Subject: [PATCH 05/17] perf(ingest): streamline CLL generation (#11645) --- metadata-ingestion/setup.py | 2 +- .../ingestion/source/redshift/lineage.py | 2 +- .../ingestion/source/tableau/tableau.py | 2 +- .../datahub/sql_parsing/sqlglot_lineage.py | 51 ++++++++++------- .../tableau/test_tableau_ingest.py | 17 ++++-- ...st_bigquery_subquery_column_inference.json | 57 +++++++++++++++++++ .../unit/sql_parsing/test_sqlglot_lineage.py | 15 +++++ 7 files changed, 118 insertions(+), 28 deletions(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index bfec2c00cb864..365da21208ecc 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -101,7 +101,7 @@ sqlglot_lib = { # Using an Acryl fork of sqlglot. 
# https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:main?expand=1 - "acryl-sqlglot[rs]==25.20.2.dev6", + "acryl-sqlglot[rs]==25.25.2.dev9", } classification_lib = { diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index 0e4cb6f1599e5..fe491ccb31850 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -133,7 +133,7 @@ def parse_alter_table_rename(default_schema: str, query: str) -> Tuple[str, str, assert isinstance(parsed_query, sqlglot.exp.Alter) prev_name = parsed_query.this.name rename_clause = parsed_query.args["actions"][0] - assert isinstance(rename_clause, sqlglot.exp.RenameTable) + assert isinstance(rename_clause, sqlglot.exp.AlterRename) new_name = rename_clause.this.name schema = parsed_query.this.db or default_schema diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py index 9f011790990ec..2c17a5a322f05 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau/tableau.py @@ -2131,7 +2131,7 @@ def _create_lineage_from_unsupported_csql( fine_grained_lineages: List[FineGrainedLineage] = [] if self.config.extract_column_level_lineage: - logger.info("Extracting CLL from custom sql") + logger.debug("Extracting CLL from custom sql") fine_grained_lineages = make_fine_grained_lineage_class( parsed_result, csql_urn, out_columns ) diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 273e9d0f9f0b1..6a7ff5be6d1ea 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -1,6 +1,5 @@ import dataclasses import functools -import itertools import logging import traceback from collections import defaultdict @@ -14,6 +13,8 @@ import sqlglot.optimizer.annotate_types import sqlglot.optimizer.optimizer import sqlglot.optimizer.qualify +import sqlglot.optimizer.qualify_columns +import sqlglot.optimizer.unnest_subqueries from datahub.cli.env_utils import get_boolean_env_variable from datahub.ingestion.graph.client import DataHubGraph @@ -63,24 +64,30 @@ SQL_LINEAGE_TIMEOUT_SECONDS = 10 -RULES_BEFORE_TYPE_ANNOTATION: tuple = tuple( - filter( - lambda func: func.__name__ - not in { - # Skip pushdown_predicates because it sometimes throws exceptions, and we - # don't actually need it for anything. - "pushdown_predicates", - # Skip normalize because it can sometimes be expensive. - "normalize", - }, - itertools.takewhile( - lambda func: func != sqlglot.optimizer.annotate_types.annotate_types, - sqlglot.optimizer.optimizer.RULES, - ), - ) +# These rules are a subset of the rules in sqlglot.optimizer.optimizer.RULES. +# If there's a change in their rules, we probably need to re-evaluate our list as well. +assert len(sqlglot.optimizer.optimizer.RULES) == 14 + +_OPTIMIZE_RULES = ( + sqlglot.optimizer.optimizer.qualify, + # We need to enable this in order for annotate types to work. 
+ sqlglot.optimizer.optimizer.pushdown_projections, + # sqlglot.optimizer.optimizer.normalize, # causes perf issues + sqlglot.optimizer.optimizer.unnest_subqueries, + # sqlglot.optimizer.optimizer.pushdown_predicates, # causes perf issues + # sqlglot.optimizer.optimizer.optimize_joins, + # sqlglot.optimizer.optimizer.eliminate_subqueries, + # sqlglot.optimizer.optimizer.merge_subqueries, + # sqlglot.optimizer.optimizer.eliminate_joins, + # sqlglot.optimizer.optimizer.eliminate_ctes, + sqlglot.optimizer.optimizer.quote_identifiers, + # These three are run separately or not run at all. + # sqlglot.optimizer.optimizer.annotate_types, + # sqlglot.optimizer.canonicalize.canonicalize, + # sqlglot.optimizer.simplify.simplify, ) -# Quick check that the rules were loaded correctly. -assert 0 < len(RULES_BEFORE_TYPE_ANNOTATION) < len(sqlglot.optimizer.optimizer.RULES) + +_DEBUG_TYPE_ANNOTATIONS = False class _ColumnRef(_FrozenModel): @@ -385,11 +392,12 @@ def _sqlglot_force_column_normalizer( schema=sqlglot_db_schema, qualify_columns=True, validate_qualify_columns=False, + allow_partial_qualification=True, identify=True, # sqlglot calls the db -> schema -> table hierarchy "catalog", "db", "table". catalog=default_db, db=default_schema, - rules=RULES_BEFORE_TYPE_ANNOTATION, + rules=_OPTIMIZE_RULES, ) except (sqlglot.errors.OptimizeError, ValueError) as e: raise SqlUnderstandingError( @@ -408,6 +416,10 @@ def _sqlglot_force_column_normalizer( except (sqlglot.errors.OptimizeError, sqlglot.errors.ParseError) as e: # This is not a fatal error, so we can continue. logger.debug("sqlglot failed to annotate or parse types: %s", e) + if _DEBUG_TYPE_ANNOTATIONS and logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Type annotated sql %s", statement.sql(pretty=True, dialect=dialect) + ) return statement, _ColumnResolver( sqlglot_db_schema=sqlglot_db_schema, @@ -907,6 +919,7 @@ def _sqlglot_lineage_inner( # At this stage we only want to qualify the table names. The columns will be dealt with later. qualify_columns=False, validate_qualify_columns=False, + allow_partial_qualification=True, # Only insert quotes where necessary. 
identify=False, ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index 5a5552a78c56f..3798df07000c8 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -19,6 +19,7 @@ from datahub.configuration.source_common import DEFAULT_ENV from datahub.emitter.mce_builder import make_schema_field_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.run.pipeline import Pipeline, PipelineContext from datahub.ingestion.source.tableau.tableau import ( TableauConfig, @@ -37,7 +38,7 @@ FineGrainedLineageUpstreamType, UpstreamLineage, ) -from datahub.metadata.schema_classes import MetadataChangeProposalClass, UpstreamClass +from datahub.metadata.schema_classes import UpstreamClass from tests.test_helpers import mce_helpers, test_connection_helpers from tests.test_helpers.state_helpers import ( get_current_checkpoint_from_pipeline, @@ -939,11 +940,12 @@ def test_tableau_unsupported_csql(): database_override_map={"production database": "prod"} ) - def test_lineage_metadata( + def check_lineage_metadata( lineage, expected_entity_urn, expected_upstream_table, expected_cll ): - mcp = cast(MetadataChangeProposalClass, next(iter(lineage)).metadata) - assert mcp.aspect == UpstreamLineage( + mcp = cast(MetadataChangeProposalWrapper, list(lineage)[0].metadata) + + expected = UpstreamLineage( upstreams=[ UpstreamClass( dataset=expected_upstream_table, @@ -966,6 +968,9 @@ def test_lineage_metadata( ) assert mcp.entityUrn == expected_entity_urn + actual_aspect = mcp.aspect + assert actual_aspect == expected + csql_urn = "urn:li:dataset:(urn:li:dataPlatform:tableau,09988088-05ad-173c-a2f1-f33ba3a13d1a,PROD)" expected_upstream_table = "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.UserDetail,PROD)" expected_cll = { @@ -996,7 +1001,7 @@ def test_lineage_metadata( }, out_columns=[], ) - test_lineage_metadata( + check_lineage_metadata( lineage=lineage, expected_entity_urn=csql_urn, expected_upstream_table=expected_upstream_table, @@ -1014,7 +1019,7 @@ def test_lineage_metadata( }, out_columns=[], ) - test_lineage_metadata( + check_lineage_metadata( lineage=lineage, expected_entity_urn=csql_urn, expected_upstream_table=expected_upstream_table, diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json new file mode 100644 index 0000000000000..32b3830bf1412 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_subquery_column_inference.json @@ -0,0 +1,57 @@ +{ + "query_type": "SELECT", + "query_type_props": {}, + "query_fingerprint": "4094ebd230c1d47c7e6879b05ab927e550923b1986eb58c5f3814396cf401d18", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "user_id", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "user_id" + } + ] + }, + { + "downstream": { + "table": null, + "column": "source", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": 
"urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "source" + } + ] + }, + { + "downstream": { + "table": null, + "column": "user_source", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,invent_dw.UserDetail,PROD)", + "column": "user_source" + } + ] + } + ], + "debug_info": { + "confidence": 0.2, + "generalized_statement": "SELECT user_id, source, user_source FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY __partition_day DESC) AS rank_ FROM invent_dw.UserDetail) AS source_user WHERE rank_ = ?" + } +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index fb1d2a0bc5011..90cc863d6bd23 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -1253,3 +1253,18 @@ def test_snowflake_drop_schema() -> None: dialect="snowflake", expected_file=RESOURCE_DIR / "test_snowflake_drop_schema.json", ) + + +def test_bigquery_subquery_column_inference() -> None: + assert_sql_result( + """\ +SELECT user_id, source, user_source +FROM ( + SELECT *, ROW_NUMBER() OVER (PARTITION BY user_id ORDER BY __partition_day DESC) AS rank_ + FROM invent_dw.UserDetail +) source_user +WHERE rank_ = 1 +""", + dialect="bigquery", + expected_file=RESOURCE_DIR / "test_bigquery_subquery_column_inference.json", + ) From 179a6714a619a4990c9845eb1997649d94ab53cb Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 18 Oct 2024 10:33:28 +0530 Subject: [PATCH 06/17] feat(ingest): ensure sqlite file delete on clean exit (#11612) --- .../ingestion/source/bigquery_v2/bigquery.py | 9 +- .../source/bigquery_v2/bigquery_queries.py | 4 + .../source/bigquery_v2/queries_extractor.py | 18 +++- .../ingestion/source/redshift/lineage_v2.py | 7 +- .../ingestion/source/redshift/redshift.py | 23 +++-- .../source/snowflake/snowflake_connection.py | 3 +- .../source/snowflake/snowflake_queries.py | 64 ++++++++----- .../source/snowflake/snowflake_v2.py | 94 ++++++++++--------- .../sql_parsing/sql_parsing_aggregator.py | 7 ++ .../bigquery_v2/test_bigquery_queries.py | 16 ++++ .../snowflake/test_snowflake_queries.py | 24 +++++ .../unit/sql_parsing/test_sql_aggregator.py | 22 +++++ 12 files changed, 203 insertions(+), 88 deletions(-) create mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 3c6202cc7cbfa..0e986acc81add 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -272,7 +272,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.set_ingestion_stage("*", QUERIES_EXTRACTION) - queries_extractor = BigQueryQueriesExtractor( + with BigQueryQueriesExtractor( connection=self.config.get_bigquery_client(), schema_api=self.bq_schema_extractor.schema_api, config=BigQueryQueriesExtractorConfig( @@ -288,9 +288,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: identifiers=self.identifiers, schema_resolver=self.sql_parser_schema_resolver, discovered_tables=self.bq_schema_extractor.table_refs, - ) - 
self.report.queries_extractor = queries_extractor.report - yield from queries_extractor.get_workunits_internal() + ) as queries_extractor: + self.report.queries_extractor = queries_extractor.report + yield from queries_extractor.get_workunits_internal() + else: if self.config.include_usage_statistics: yield from self.usage_extractor.get_usage_workunits( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py index ed27aae19ce96..47f21c9f32353 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_queries.py @@ -88,3 +88,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: def get_report(self) -> BigQueryQueriesSourceReport: return self.report + + def close(self) -> None: + self.queries_extractor.close() + self.connection.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index b4a443673b9a9..afaaaf51964f8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -13,6 +13,7 @@ BaseTimeWindowConfig, get_time_bucket, ) +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.source import SourceReport from datahub.ingestion.api.source_helpers import auto_workunit from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -114,7 +115,7 @@ class BigQueryQueriesExtractorConfig(BigQueryBaseConfig): ) -class BigQueryQueriesExtractor: +class BigQueryQueriesExtractor(Closeable): """ Extracts query audit log and generates usage/lineage/operation workunits. @@ -181,6 +182,7 @@ def __init__( is_allowed_table=self.is_allowed_table, format_queries=False, ) + self.report.sql_aggregator = self.aggregator.report self.report.num_discovered_tables = ( len(self.discovered_tables) if self.discovered_tables else None @@ -273,12 +275,14 @@ def get_workunits_internal( self.report.num_unique_queries = len(queries_deduped) logger.info(f"Found {self.report.num_unique_queries} unique queries") - with self.report.audit_log_load_timer: + with self.report.audit_log_load_timer, queries_deduped: i = 0 for _, query_instances in queries_deduped.items(): for query in query_instances.values(): if i > 0 and i % 10000 == 0: - logger.info(f"Added {i} query log entries to SQL aggregator") + logger.info( + f"Added {i} query log equeries_dedupedntries to SQL aggregator" + ) if self.report.sql_aggregator: logger.info(self.report.sql_aggregator.as_string()) @@ -287,6 +291,11 @@ def get_workunits_internal( yield from auto_workunit(self.aggregator.gen_metadata()) + if not use_cached_audit_log: + queries.close() + shared_connection.close() + audit_log_file.unlink(missing_ok=True) + def deduplicate_queries( self, queries: FileBackedList[ObservedQuery] ) -> FileBackedDict[Dict[int, ObservedQuery]]: @@ -404,6 +413,9 @@ def _parse_audit_log_row(self, row: BigQueryJob) -> ObservedQuery: return entry + def close(self) -> None: + self.aggregator.close() + def _extract_query_text(row: BigQueryJob) -> str: # We wrap select statements in a CTE to make them parseable as DML statement. 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 4df64c80bad8a..53f9383ec02a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -5,6 +5,7 @@ import redshift_connector from datahub.emitter import mce_builder +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig @@ -39,7 +40,7 @@ logger = logging.getLogger(__name__) -class RedshiftSqlLineageV2: +class RedshiftSqlLineageV2(Closeable): # does lineage and usage based on SQL parsing. def __init__( @@ -56,6 +57,7 @@ def __init__( self.context = context self.database = database + self.aggregator = SqlParsingAggregator( platform=self.platform, platform_instance=self.config.platform_instance, @@ -436,3 +438,6 @@ def generate(self) -> Iterable[MetadataWorkUnit]: message="Unexpected error(s) while attempting to extract lineage from SQL queries. See the full logs for more details.", context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}", ) + + def close(self) -> None: + self.aggregator.close() diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index a9fc9ab8f3e99..76030cea98494 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -451,24 +451,23 @@ def _extract_metadata( ) if self.config.use_lineage_v2: - lineage_extractor = RedshiftSqlLineageV2( + with RedshiftSqlLineageV2( config=self.config, report=self.report, context=self.ctx, database=database, redundant_run_skip_handler=self.redundant_lineage_run_skip_handler, - ) - - yield from lineage_extractor.aggregator.register_schemas_from_stream( - self.process_schemas(connection, database) - ) + ) as lineage_extractor: + yield from lineage_extractor.aggregator.register_schemas_from_stream( + self.process_schemas(connection, database) + ) - self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) - yield from self.extract_lineage_v2( - connection=connection, - database=database, - lineage_extractor=lineage_extractor, - ) + self.report.report_ingestion_stage_start(LINEAGE_EXTRACTION) + yield from self.extract_lineage_v2( + connection=connection, + database=database, + lineage_extractor=lineage_extractor, + ) all_tables = self.get_all_tables() else: diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py index d39e95a884dbc..a9f454cfd3cdb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_connection.py @@ -18,6 +18,7 @@ from datahub.configuration.connection_resolver import auto_connection_resolver from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.source.snowflake.constants import ( CLIENT_PREFETCH_THREADS, 
CLIENT_SESSION_KEEP_ALIVE, @@ -364,7 +365,7 @@ def get_connection(self) -> "SnowflakeConnection": ) from e -class SnowflakeConnection: +class SnowflakeConnection(Closeable): _connection: NativeSnowflakeConnection def __init__(self, connection: NativeSnowflakeConnection): diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py index 1445d02aa49db..e11073d77b46e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_queries.py @@ -1,3 +1,4 @@ +import contextlib import dataclasses import functools import json @@ -17,6 +18,7 @@ BaseTimeWindowConfig, BucketDuration, ) +from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.report import Report from datahub.ingestion.api.source import Source, SourceReport @@ -121,7 +123,7 @@ class SnowflakeQueriesSourceReport(SourceReport): queries_extractor: Optional[SnowflakeQueriesExtractorReport] = None -class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin): +class SnowflakeQueriesExtractor(SnowflakeStructuredReportMixin, Closeable): def __init__( self, connection: SnowflakeConnection, @@ -143,28 +145,33 @@ def __init__( self._structured_report = structured_report - self.aggregator = SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.identifiers.identifier_config.platform_instance, - env=self.identifiers.identifier_config.env, - schema_resolver=schema_resolver, - graph=graph, - eager_graph_load=False, - generate_lineage=self.config.include_lineage, - generate_queries=self.config.include_queries, - generate_usage_statistics=self.config.include_usage_statistics, - generate_query_usage_statistics=self.config.include_query_usage_statistics, - usage_config=BaseUsageConfig( - bucket_duration=self.config.window.bucket_duration, - start_time=self.config.window.start_time, - end_time=self.config.window.end_time, - user_email_pattern=self.config.user_email_pattern, - # TODO make the rest of the fields configurable - ), - generate_operations=self.config.include_operations, - is_temp_table=self.is_temp_table, - is_allowed_table=self.is_allowed_table, - format_queries=False, + # The exit stack helps ensure that we close all the resources we open. 
+ self._exit_stack = contextlib.ExitStack() + + self.aggregator: SqlParsingAggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.identifiers.identifier_config.platform_instance, + env=self.identifiers.identifier_config.env, + schema_resolver=schema_resolver, + graph=graph, + eager_graph_load=False, + generate_lineage=self.config.include_lineage, + generate_queries=self.config.include_queries, + generate_usage_statistics=self.config.include_usage_statistics, + generate_query_usage_statistics=self.config.include_query_usage_statistics, + usage_config=BaseUsageConfig( + bucket_duration=self.config.window.bucket_duration, + start_time=self.config.window.start_time, + end_time=self.config.window.end_time, + user_email_pattern=self.config.user_email_pattern, + # TODO make the rest of the fields configurable + ), + generate_operations=self.config.include_operations, + is_temp_table=self.is_temp_table, + is_allowed_table=self.is_allowed_table, + format_queries=False, + ) ) self.report.sql_aggregator = self.aggregator.report @@ -248,6 +255,10 @@ def get_workunits_internal( self.aggregator.add(query) yield from auto_workunit(self.aggregator.gen_metadata()) + if not use_cached_audit_log: + queries.close() + shared_connection.close() + audit_log_file.unlink(missing_ok=True) def fetch_copy_history(self) -> Iterable[KnownLineageMapping]: # Derived from _populate_external_lineage_from_copy_history. @@ -426,6 +437,9 @@ def _parse_audit_log_row(self, row: Dict[str, Any]) -> PreparsedQuery: ) return entry + def close(self) -> None: + self._exit_stack.close() + class SnowflakeQueriesSource(Source): def __init__(self, ctx: PipelineContext, config: SnowflakeQueriesSourceConfig): @@ -468,6 +482,10 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: def get_report(self) -> SnowflakeQueriesSourceReport: return self.report + def close(self) -> None: + self.connection.close() + self.queries_extractor.close() + # Make sure we don't try to generate too much info for a single query. _MAX_TABLES_PER_QUERY = 20 diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 0d7881f36554d..dd7f73268fdc4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -1,3 +1,4 @@ +import contextlib import functools import json import logging @@ -149,7 +150,12 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): cached_domains=[k for k in self.config.domain], graph=self.ctx.graph ) - self.connection = self.config.get_connection() + # The exit stack helps ensure that we close all the resources we open. 
+ self._exit_stack = contextlib.ExitStack() + + self.connection: SnowflakeConnection = self._exit_stack.enter_context( + self.config.get_connection() + ) # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary(connection=self.connection) @@ -157,25 +163,27 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): self.aggregator: Optional[SqlParsingAggregator] = None if self.config.use_queries_v2 or self.config.include_table_lineage: - self.aggregator = SqlParsingAggregator( - platform=self.identifiers.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - graph=self.ctx.graph, - eager_graph_load=( - # If we're ingestion schema metadata for tables/views, then we will populate - # schemas into the resolver as we go. We only need to do a bulk fetch - # if we're not ingesting schema metadata as part of ingestion. - not ( - self.config.include_technical_schema - and self.config.include_tables - and self.config.include_views - ) - and not self.config.lazy_schema_resolver - ), - generate_usage_statistics=False, - generate_operations=False, - format_queries=self.config.format_sql_queries, + self.aggregator = self._exit_stack.enter_context( + SqlParsingAggregator( + platform=self.identifiers.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + graph=self.ctx.graph, + eager_graph_load=( + # If we're ingestion schema metadata for tables/views, then we will populate + # schemas into the resolver as we go. We only need to do a bulk fetch + # if we're not ingesting schema metadata as part of ingestion. + not ( + self.config.include_technical_schema + and self.config.include_tables + and self.config.include_views + ) + and not self.config.lazy_schema_resolver + ), + generate_usage_statistics=False, + generate_operations=False, + format_queries=self.config.format_sql_queries, + ) ) self.report.sql_aggregator = self.aggregator.report @@ -191,14 +199,16 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): pipeline_name=self.ctx.pipeline_name, run_id=self.ctx.run_id, ) - self.lineage_extractor = SnowflakeLineageExtractor( - config, - self.report, - connection=self.connection, - filters=self.filters, - identifiers=self.identifiers, - redundant_run_skip_handler=redundant_lineage_run_skip_handler, - sql_aggregator=self.aggregator, + self.lineage_extractor = self._exit_stack.enter_context( + SnowflakeLineageExtractor( + config, + self.report, + connection=self.connection, + filters=self.filters, + identifiers=self.identifiers, + redundant_run_skip_handler=redundant_lineage_run_skip_handler, + sql_aggregator=self.aggregator, + ) ) self.usage_extractor: Optional[SnowflakeUsageExtractor] = None @@ -213,13 +223,15 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): pipeline_name=self.ctx.pipeline_name, run_id=self.ctx.run_id, ) - self.usage_extractor = SnowflakeUsageExtractor( - config, - self.report, - connection=self.connection, - filter=self.filters, - identifiers=self.identifiers, - redundant_run_skip_handler=redundant_usage_run_skip_handler, + self.usage_extractor = self._exit_stack.enter_context( + SnowflakeUsageExtractor( + config, + self.report, + connection=self.connection, + filter=self.filters, + identifiers=self.identifiers, + redundant_run_skip_handler=redundant_usage_run_skip_handler, + ) ) self.profiling_state_handler: Optional[ProfilingHandler] = None @@ -444,10 +456,6 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: 
def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self._snowflake_clear_ocsp_cache() - self.connection = self.config.get_connection() - if self.connection is None: - return - self.inspect_session_metadata(self.connection) snowsight_url_builder = None @@ -513,7 +521,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: schema_resolver = self.aggregator._schema_resolver - queries_extractor = SnowflakeQueriesExtractor( + queries_extractor: SnowflakeQueriesExtractor = SnowflakeQueriesExtractor( connection=self.connection, config=SnowflakeQueriesExtractorConfig( window=self.config, @@ -535,6 +543,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: # it should be pretty straightforward to refactor this and only initialize the aggregator once. self.report.queries_extractor = queries_extractor.report yield from queries_extractor.get_workunits_internal() + queries_extractor.close() else: if self.config.include_table_lineage and self.lineage_extractor: @@ -723,7 +732,4 @@ def _snowflake_clear_ocsp_cache(self) -> None: def close(self) -> None: super().close() StatefulIngestionSourceBase.close(self) - if self.lineage_extractor: - self.lineage_extractor.close() - if self.usage_extractor: - self.usage_extractor.close() + self._exit_stack.close() diff --git a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py index 5f2709fe42660..0b7ad14a8c1b4 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sql_parsing_aggregator.py @@ -275,6 +275,8 @@ class SqlAggregatorReport(Report): tool_meta_report: Optional[ToolMetaExtractorReport] = None def compute_stats(self) -> None: + if self._aggregator._closed: + return self.schema_resolver_count = self._aggregator._schema_resolver.schema_count() self.num_unique_query_fingerprints = len(self._aggregator._query_map) @@ -345,6 +347,7 @@ def __init__( # The exit stack helps ensure that we close all the resources we open. self._exit_stack = contextlib.ExitStack() + self._closed: bool = False # Set up the schema resolver. 
self._schema_resolver: SchemaResolver @@ -456,12 +459,16 @@ def __init__( shared_connection=self._shared_connection, tablename="query_usage_counts", ) + self._exit_stack.push(self._query_usage_counts) # Tool Extractor self._tool_meta_extractor = ToolMetaExtractor() self.report.tool_meta_report = self._tool_meta_extractor.report def close(self) -> None: + # Compute stats once before closing connections + self.report.compute_stats() + self._closed = True self._exit_stack.close() @property diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py index 9290100b0c521..ef846f698f156 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery_queries.py @@ -1,4 +1,5 @@ import json +import os from datetime import datetime from pathlib import Path from unittest.mock import patch @@ -6,7 +7,9 @@ import pytest from freezegun import freeze_time +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.bigquery_v2.bigquery_queries import ( + BigQueryQueriesSource, BigQueryQueriesSourceReport, ) from datahub.metadata.urns import CorpUserUrn @@ -93,3 +96,16 @@ def test_queries_ingestion(project_client, client, pytestconfig, monkeypatch, tm output_path=mcp_output_path, golden_path=mcp_golden_path, ) + + +@patch("google.cloud.bigquery.Client") +@patch("google.cloud.resourcemanager_v3.ProjectsClient") +def test_source_close_cleans_tmp(projects_client, client, tmp_path): + with patch("tempfile.tempdir", str(tmp_path)): + source = BigQueryQueriesSource.create( + {"project_ids": ["project1"]}, PipelineContext("run-id") + ) + assert len(os.listdir(tmp_path)) > 0 + # This closes QueriesExtractor which in turn closes SqlParsingAggregator + source.close() + assert len(os.listdir(tmp_path)) == 0 diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py new file mode 100644 index 0000000000000..82f5691bcee3d --- /dev/null +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_queries.py @@ -0,0 +1,24 @@ +import os +from unittest.mock import patch + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.snowflake.snowflake_queries import SnowflakeQueriesSource + + +@patch("snowflake.connector.connect") +def test_source_close_cleans_tmp(snowflake_connect, tmp_path): + with patch("tempfile.tempdir", str(tmp_path)): + source = SnowflakeQueriesSource.create( + { + "connection": { + "account_id": "ABC12345.ap-south-1.aws", + "username": "TST_USR", + "password": "TST_PWD", + } + }, + PipelineContext("run-id"), + ) + assert len(os.listdir(tmp_path)) > 0 + # This closes QueriesExtractor which in turn closes SqlParsingAggregator + source.close() + assert len(os.listdir(tmp_path)) == 0 diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py index 0d21936a74d07..849d550ef69c5 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sql_aggregator.py @@ -1,5 +1,7 @@ +import os import pathlib from datetime import datetime, timezone +from unittest.mock import patch import pytest from freezegun import freeze_time @@ -661,3 +663,23 @@ def test_basic_usage(pytestconfig: pytest.Config) -> None: 
outputs=mcps, golden_path=RESOURCE_DIR / "test_basic_usage.json", ) + + +def test_sql_aggreator_close_cleans_tmp(tmp_path): + frozen_timestamp = parse_user_datetime(FROZEN_TIME) + with patch("tempfile.tempdir", str(tmp_path)): + aggregator = SqlParsingAggregator( + platform="redshift", + generate_lineage=False, + generate_usage_statistics=True, + generate_operations=False, + usage_config=BaseUsageConfig( + start_time=get_time_bucket(frozen_timestamp, BucketDuration.DAY), + end_time=frozen_timestamp, + ), + generate_queries=True, + generate_query_usage_statistics=True, + ) + assert len(os.listdir(tmp_path)) > 0 + aggregator.close() + assert len(os.listdir(tmp_path)) == 0 From 011e5b97e7c0f0d330edae59368cc5f8c8512126 Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Fri, 18 Oct 2024 01:12:05 -0700 Subject: [PATCH 07/17] =?UTF-8?q?fix(sdk):=20platform=20resource=20-=20sup?= =?UTF-8?q?port=20indexed=20queries=20when=20urns=20are=20i=E2=80=A6=20(#1?= =?UTF-8?q?1660)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../platformresource/platform_resource.py | 41 +++++++++- .../test_platform_resource.py | 74 +++++++++++++++++++ 2 files changed, 113 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 1556a67a9e555..349b0ff11d84f 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -85,6 +85,7 @@ def scroll_urns_by_filter( self, entity_type: str, extra_or_filters: List[Dict[str, str]], + extra_and_filters: List[Dict[str, str]] = [], ) -> Iterable[str]: """ Scroll through all urns that match the given filters @@ -92,10 +93,26 @@ def scroll_urns_by_filter( key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) assert key_aspect, f"No key aspect found for entity type {entity_type}" + if extra_or_filters and extra_and_filters: + raise ValueError( + "Only one of extra_or_filters and extra_and_filters should be provided" + ) count = 1000 - query = " OR ".join( - [f"{filter['field']}:{filter['value']}" for filter in extra_or_filters] + query = ( + " OR ".join( + [ + f"{filter['field']}:\"{filter['value']}\"" + for filter in extra_or_filters + ] + ) + if extra_or_filters + else " AND ".join( + [ + f"{filter['field']}:\"{filter['value']}\"" + for filter in extra_and_filters + ] + ) ) scroll_id = None while True: @@ -252,3 +269,23 @@ def search_by_key( def delete(self, graph_client: DataHubGraph, hard: bool = True) -> None: graph_client.delete_entity(str(PlatformResourceUrn(self.id)), hard=hard) + + @staticmethod + def search_by_filters( + graph_client: DataHubGraph, + and_filters: List[Dict[str, str]] = [], + or_filters: List[Dict[str, str]] = [], + ) -> Iterable["PlatformResource"]: + if and_filters and or_filters: + raise ValueError( + "Only one of and_filters and or_filters should be provided" + ) + openapi_client = OpenAPIGraphClient(graph_client) + for urn in openapi_client.scroll_urns_by_filter( + entity_type="platformResource", + extra_or_filters=or_filters if or_filters else [], + extra_and_filters=and_filters if and_filters else [], + ): + platform_resource = PlatformResource.from_datahub(graph_client, urn) + if platform_resource: + yield platform_resource diff --git a/smoke-test/tests/platform_resources/test_platform_resource.py 
b/smoke-test/tests/platform_resources/test_platform_resource.py index 7c53f72d843c9..7ebfd4d6ea15b 100644 --- a/smoke-test/tests/platform_resources/test_platform_resource.py +++ b/smoke-test/tests/platform_resources/test_platform_resource.py @@ -112,3 +112,77 @@ def test_platform_resource_non_existent(graph_client, test_id): graph_client=graph_client, ) assert platform_resource is None + + +def test_platform_resource_urn_secondary_key(graph_client, test_id): + key = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_{test_id}", + ) + dataset_urn = ( + f"urn:li:dataset:(urn:li:dataPlatform:test,test_secondary_key_{test_id},PROD)" + ) + platform_resource = PlatformResource.create( + key=key, + value={"test_key": f"test_value_{test_id}"}, + secondary_keys=[dataset_urn], + ) + platform_resource.to_datahub(graph_client) + wait_for_writes_to_sync() + + read_platform_resources = [ + r + for r in PlatformResource.search_by_key( + graph_client, dataset_urn, primary=False + ) + ] + assert len(read_platform_resources) == 1 + assert read_platform_resources[0] == platform_resource + + +def test_platform_resource_listing_by_resource_type(graph_client, test_id): + # Generate two resources with the same resource type + key1 = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_1_{test_id}", + ) + platform_resource1 = PlatformResource.create( + key=key1, + value={"test_key": f"test_value_1_{test_id}"}, + ) + platform_resource1.to_datahub(graph_client) + + key2 = PlatformResourceKey( + platform=f"test_platform_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_2_{test_id}", + ) + platform_resource2 = PlatformResource.create( + key=key2, + value={"test_key": f"test_value_2_{test_id}"}, + ) + platform_resource2.to_datahub(graph_client) + + wait_for_writes_to_sync() + + search_results = [ + r + for r in PlatformResource.search_by_filters( + graph_client, + and_filters=[ + { + "field": "resourceType", + "condition": "EQUAL", + "value": key1.resource_type, + } + ], + ) + ] + assert len(search_results) == 2 + + read_platform_resource_1 = next(r for r in search_results if r.id == key1.id) + read_platform_resource_2 = next(r for r in search_results if r.id == key2.id) + assert read_platform_resource_1 == platform_resource1 + assert read_platform_resource_2 == platform_resource2 From dcf4793c3d8fb192b7ae3e3248f50445d59e8912 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Fri, 18 Oct 2024 03:13:18 -0700 Subject: [PATCH 08/17] fix(ingest/dbt): Prevent lineage cycles when parsing sql of dbt models (#11666) --- .../ingestion/source/dbt/dbt_common.py | 6 +++ .../tests/unit/test_dbt_source.py | 42 +++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 4cd3c934ce634..c95d0e545c598 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1989,6 +1989,11 @@ def _translate_dbt_name_to_upstream_urn(dbt_name: str) -> str: time=mce_builder.get_sys_time(), actor=_DEFAULT_ACTOR, ) + sibling_urn = node.get_urn( + self.config.target_platform, + self.config.env, + self.config.target_platform_instance, + ) return UpstreamLineageClass( upstreams=[ UpstreamClass( @@ 
-1997,6 +2002,7 @@ def _translate_dbt_name_to_upstream_urn(dbt_name: str) -> str: auditStamp=auditStamp, ) for upstream in upstream_urns + if not (node.node_type == "model" and upstream == sibling_urn) ], fineGrainedLineages=( (cll or None) if self.config.include_column_lineage else None diff --git a/metadata-ingestion/tests/unit/test_dbt_source.py b/metadata-ingestion/tests/unit/test_dbt_source.py index 7d01ecd034523..f0d4c3408271f 100644 --- a/metadata-ingestion/tests/unit/test_dbt_source.py +++ b/metadata-ingestion/tests/unit/test_dbt_source.py @@ -10,6 +10,7 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.dbt import dbt_cloud from datahub.ingestion.source.dbt.dbt_cloud import DBTCloudConfig +from datahub.ingestion.source.dbt.dbt_common import DBTNode from datahub.ingestion.source.dbt.dbt_core import ( DBTCoreConfig, DBTCoreSource, @@ -253,6 +254,47 @@ def test_dbt_config_prefer_sql_parser_lineage(): assert config.prefer_sql_parser_lineage is True +def test_dbt_prefer_sql_parser_lineage_no_self_reference(): + ctx = PipelineContext(run_id="test-run-id") + config = DBTCoreConfig.parse_obj( + { + **create_base_dbt_config(), + "skip_sources_in_lineage": True, + "prefer_sql_parser_lineage": True, + } + ) + source: DBTCoreSource = DBTCoreSource(config, ctx, "dbt") + all_nodes_map = { + "model1": DBTNode( + name="model1", + database=None, + schema=None, + alias=None, + comment="", + description="", + language=None, + raw_code=None, + dbt_adapter="postgres", + dbt_name="model1", + dbt_file_path=None, + dbt_package_name=None, + node_type="model", + materialization="table", + max_loaded_at=None, + catalog_type=None, + missing_from_catalog=False, + owner=None, + compiled_code="SELECT d FROM results WHERE d > (SELECT MAX(d) FROM model1)", + ), + } + source._infer_schemas_and_update_cll(all_nodes_map) + upstream_lineage = source._create_lineage_aspect_for_dbt_node( + all_nodes_map["model1"], all_nodes_map + ) + assert upstream_lineage is not None + assert len(upstream_lineage.upstreams) == 1 + + def test_dbt_s3_config(): # test missing aws config config_dict: dict = { From ba7a43f530cfe21c179e32c8712eb040b83a4c13 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Fri, 18 Oct 2024 20:16:08 +0200 Subject: [PATCH 09/17] fix(ingest/dagster): Fix JobSnapshot import is broken (#11672) --- .../datahub_dagster_plugin/client/dagster_generator.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py index a2cf159dd12f6..df123b127e040 100644 --- a/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py +++ b/metadata-ingestion-modules/dagster-plugin/src/datahub_dagster_plugin/client/dagster_generator.py @@ -12,7 +12,13 @@ TableSchemaMetadataValue, ) from dagster._core.execution.stats import RunStepKeyStatsSnapshot, StepEventStatus -from dagster._core.snap import JobSnapshot + +try: + from dagster._core.snap import JobSnapshot # type: ignore[attr-defined] +except ImportError: + # Import changed since Dagster 1.8.12 to this -> https://github.com/dagster-io/dagster/commit/29a37d1f0260cfd112849633d1096ffc916d6c95 + from dagster._core.snap import JobSnap as JobSnapshot + from dagster._core.snap.node import OpDefSnap from dagster._core.storage.dagster_run import DagsterRun, DagsterRunStatsSnapshot from 
datahub.api.entities.datajob import DataFlow, DataJob From 72d1236669c4052488f9ff23517f4568edbe6610 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Fri, 18 Oct 2024 12:01:39 -0700 Subject: [PATCH 10/17] feat(ingest/transformer/domain): Add support for on conflict do nothing to dataset domain transformers (#11649) --- .../docs/transformer/dataset_transformer.md | 28 +++---- .../src/datahub/ingestion/graph/client.py | 1 + .../ingestion/transformer/dataset_domain.py | 41 ++++++---- .../tests/unit/test_transform_dataset.py | 76 +++++++++++++++++++ 4 files changed, 119 insertions(+), 27 deletions(-) diff --git a/metadata-ingestion/docs/transformer/dataset_transformer.md b/metadata-ingestion/docs/transformer/dataset_transformer.md index d48c6d2c1ab5b..66274ce64a8d2 100644 --- a/metadata-ingestion/docs/transformer/dataset_transformer.md +++ b/metadata-ingestion/docs/transformer/dataset_transformer.md @@ -122,12 +122,13 @@ transformers: ``` ## Simple Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|--------------|-------------|---------------------------------------------------------------------| -| `owner_urns` | ✅ | list[string] | | List of owner urns. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| Field | Required | Type | Default | Description | +|--------------------|----------|--------------|-------------|------------------------------------------------------------------------------------------------------------| +| `owner_urns` | ✅ | list[string] | | List of owner urns. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove ownership from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | For transformer behaviour on `replace_existing` and `semantics`, please refer section [Relationship Between replace_existing And semantics](#relationship-between-replace_existing-and-semantics). @@ -191,13 +192,14 @@ transformers: ## Pattern Add Dataset ownership ### Config Details -| Field | Required | Type | Default | Description | -|--------------------|----------|----------------------|-------------|-----------------------------------------------------------------------------------------| -| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | -| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | -| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | -| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | -| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, then ownership will be attached to both the dataset and its container. 
| +| Field | Required | Type | Default | Description | +|--------------------|----------|----------------------|-------------|------------------------------------------------------------------------------------------------------------------------------| +| `owner_pattern` | ✅ | map[regx, list[urn]] | | entity urn with regular expression and list of owners urn apply to matching entity urn. | +| `ownership_type` | | string | "DATAOWNER" | ownership type of the owners (either as enum or ownership type urn) | +| `replace_existing` | | boolean | `false` | Whether to remove owners from entity sent by ingestion source. | +| `semantics` | | enum | `OVERWRITE` | Whether to OVERWRITE or PATCH the entity present on DataHub GMS. | +| `is_container` | | bool | `false` | Whether to also consider a container or not. If true, then ownership will be attached to both the dataset and its container. | +| `on_conflict` | | enum | `DO_UPDATE` | Whether to make changes if domains already exist. If set to DO_NOTHING, `semantics` setting is irrelevant. | let’s suppose we’d like to append a series of users who we know to own a different dataset from a data source but aren't detected during normal ingestion. To do so, we can use the `pattern_add_dataset_ownership` module that’s included in the ingestion framework. This will match the pattern to `urn` of the dataset and assign the respective owners. diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index e8fae6254ae88..1d2528a24c4e5 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -351,6 +351,7 @@ def get_tags(self, entity_urn: str) -> Optional[GlobalTagsClass]: def get_glossary_terms(self, entity_urn: str) -> Optional[GlossaryTermsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=GlossaryTermsClass) + @functools.lru_cache(maxsize=1) def get_domain(self, entity_urn: str) -> Optional[DomainsClass]: return self.get_aspect(entity_urn=entity_urn, aspect_type=DomainsClass) diff --git a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py index 6a83824815265..6b78b71eaa78e 100644 --- a/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py +++ b/metadata-ingestion/src/datahub/ingestion/transformer/dataset_domain.py @@ -1,6 +1,8 @@ import logging +from enum import auto from typing import Callable, Dict, List, Optional, Sequence, Union, cast +from datahub.configuration._config_enum import ConfigEnum from datahub.configuration.common import ( ConfigurationError, KeyValuePattern, @@ -23,6 +25,13 @@ logger = logging.getLogger(__name__) +class TransformerOnConflict(ConfigEnum): + """Describes the behavior of the transformer when writing an aspect that already exists.""" + + DO_UPDATE = auto() # On conflict, apply the new aspect + DO_NOTHING = auto() # On conflict, do not apply the new aspect + + class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): get_domains_to_add: Union[ Callable[[str], DomainsClass], @@ -32,10 +41,12 @@ class AddDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): _resolve_domain_fn = pydantic_resolve_key("get_domains_to_add") is_container: bool = False + on_conflict: TransformerOnConflict = TransformerOnConflict.DO_UPDATE class SimpleDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): domains: List[str] + on_conflict: 
TransformerOnConflict = TransformerOnConflict.DO_UPDATE class PatternDatasetDomainSemanticsConfig(TransformerSemanticsConfigModel): @@ -80,12 +91,13 @@ def get_domain_class( @staticmethod def _merge_with_server_domains( - graph: DataHubGraph, urn: str, mce_domain: Optional[DomainsClass] + graph: Optional[DataHubGraph], urn: str, mce_domain: Optional[DomainsClass] ) -> Optional[DomainsClass]: if not mce_domain or not mce_domain.domains: # nothing to add, no need to consult server return None + assert graph server_domain = graph.get_domain(entity_urn=urn) if server_domain: # compute patch @@ -155,7 +167,7 @@ def transform_aspect( self, entity_urn: str, aspect_name: str, aspect: Optional[Aspect] ) -> Optional[Aspect]: in_domain_aspect: DomainsClass = cast(DomainsClass, aspect) - domain_aspect = DomainsClass(domains=[]) + domain_aspect: DomainsClass = DomainsClass(domains=[]) # Check if we have received existing aspect if in_domain_aspect is not None and self.config.replace_existing is False: domain_aspect.domains.extend(in_domain_aspect.domains) @@ -164,16 +176,18 @@ def transform_aspect( domain_aspect.domains.extend(domain_to_add.domains) - if self.config.semantics == TransformerSemantics.PATCH: - assert self.ctx.graph - patch_domain_aspect: Optional[ - DomainsClass - ] = AddDatasetDomain._merge_with_server_domains( - self.ctx.graph, entity_urn, domain_aspect - ) - return cast(Optional[Aspect], patch_domain_aspect) - - return cast(Optional[Aspect], domain_aspect) + final_aspect: Optional[DomainsClass] = domain_aspect + if domain_aspect.domains: + if self.config.on_conflict == TransformerOnConflict.DO_NOTHING: + assert self.ctx.graph + server_domain = self.ctx.graph.get_domain(entity_urn) + if server_domain and server_domain.domains: + return None + if self.config.semantics == TransformerSemantics.PATCH: + final_aspect = AddDatasetDomain._merge_with_server_domains( + self.ctx.graph, entity_urn, domain_aspect + ) + return cast(Optional[Aspect], final_aspect) class SimpleAddDatasetDomain(AddDatasetDomain): @@ -186,8 +200,7 @@ def __init__( domains = AddDatasetDomain.get_domain_class(ctx.graph, config.domains) generic_config = AddDatasetDomainSemanticsConfig( get_domains_to_add=lambda _: domains, - semantics=config.semantics, - replace_existing=config.replace_existing, + **config.dict(exclude={"domains"}), ) super().__init__(generic_config, ctx) diff --git a/metadata-ingestion/tests/unit/test_transform_dataset.py b/metadata-ingestion/tests/unit/test_transform_dataset.py index 2e2e85b5d1811..4e9a38cb37ae6 100644 --- a/metadata-ingestion/tests/unit/test_transform_dataset.py +++ b/metadata-ingestion/tests/unit/test_transform_dataset.py @@ -56,6 +56,7 @@ from datahub.ingestion.transformer.dataset_domain import ( PatternAddDatasetDomain, SimpleAddDatasetDomain, + TransformerOnConflict, ) from datahub.ingestion.transformer.dataset_domain_based_on_tags import ( DatasetTagDomainMapper, @@ -2498,6 +2499,81 @@ def fake_get_domain(entity_urn: str) -> models.DomainsClass: assert server_domain in transformed_aspect.domains +def test_simple_add_dataset_domain_on_conflict_do_nothing( + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + server_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def 
fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[server_domain]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 1 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, EndOfStream) + + +def test_simple_add_dataset_domain_on_conflict_do_nothing_no_conflict( + pytestconfig, tmp_path, mock_time, mock_datahub_graph_instance +): + acryl_domain = builder.make_domain_urn("acryl.io") + datahub_domain = builder.make_domain_urn("datahubproject.io") + irrelevant_domain = builder.make_domain_urn("test.io") + + pipeline_context = PipelineContext(run_id="transformer_pipe_line") + pipeline_context.graph = mock_datahub_graph_instance + + # Return fake aspect to simulate server behaviour + def fake_get_domain(entity_urn: str) -> models.DomainsClass: + return models.DomainsClass(domains=[]) + + pipeline_context.graph.get_domain = fake_get_domain # type: ignore + + output = run_dataset_transformer_pipeline( + transformer_type=SimpleAddDatasetDomain, + aspect=models.DomainsClass(domains=[datahub_domain]), + config={ + "replace_existing": False, + "semantics": TransformerSemantics.PATCH, + "domains": [acryl_domain], + "on_conflict": TransformerOnConflict.DO_NOTHING, + }, + pipeline_context=pipeline_context, + ) + + assert len(output) == 2 + assert output[0] is not None + assert output[0].record is not None + assert isinstance(output[0].record, MetadataChangeProposalWrapper) + assert output[0].record.aspect is not None + assert isinstance(output[0].record.aspect, models.DomainsClass) + transformed_aspect = cast(models.DomainsClass, output[0].record.aspect) + assert len(transformed_aspect.domains) == 2 + assert datahub_domain in transformed_aspect.domains + assert acryl_domain in transformed_aspect.domains + assert irrelevant_domain not in transformed_aspect.domains + + def test_pattern_add_dataset_domain_aspect_name(mock_datahub_graph_instance): pipeline_context: PipelineContext = PipelineContext( run_id="test_simple_add_dataset_domain" From 7e4d4aba122f3ec22825ef5005f759172be26678 Mon Sep 17 00:00:00 2001 From: Jay Feldman <8128360+feldjay@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:58:52 -0400 Subject: [PATCH 11/17] fix(ingest/looker): Remove bad imports from looker_common (#11663) --- .../src/datahub/ingestion/source/looker/looker_common.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 3cbb13375229b..317b212f7fa96 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -929,7 +929,6 @@ def from_api( # noqa: C901 reporter: SourceReport, source_config: LookerDashboardSourceConfig, ) -> Optional["LookerExplore"]: # noqa: C901 - from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME try: explore = client.lookml_model_explore(model, explore_name) @@ -1194,7 +1193,6 @@ def _to_metadata_events( # noqa: C901 ) -> 
Optional[List[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]]: # We only generate MCE-s for explores that contain from clauses and do NOT contain joins # All other explores (passthrough explores and joins) end in correct resolution of lineage, and don't need additional nodes in the graph. - from datahub.ingestion.source.looker.lookml_source import _BASE_PROJECT_NAME dataset_snapshot = DatasetSnapshot( urn=self.get_explore_urn(config), From dfd7293f8ded30f6db5a3a13b045a2c9b5fdf21d Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 13:05:06 -0700 Subject: [PATCH 12/17] feat(ingest/looker): include project name in model/explore properties (#11664) Co-authored-by: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> --- .../ingestion/source/looker/looker_common.py | 16 ++-- .../ingestion/source/looker/looker_source.py | 41 ++++++---- .../looker/golden_looker_mces.json | 7 ++ .../looker/golden_test_allow_ingest.json | 4 + ...olden_test_external_project_view_mces.json | 4 + .../looker/golden_test_file_path_ingest.json | 4 + ...olden_test_folder_path_pattern_ingest.json | 4 + .../golden_test_independent_look_ingest.json | 82 +++++++++++-------- .../looker/golden_test_ingest.json | 4 + .../looker/golden_test_ingest_joins.json | 4 + .../golden_test_ingest_unaliased_joins.json | 4 + ...en_test_non_personal_independent_look.json | 7 ++ .../looker_mces_golden_deleted_stateful.json | 16 ++-- .../looker/looker_mces_usage_history.json | 4 + 14 files changed, 135 insertions(+), 66 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 317b212f7fa96..3d1683100474e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -1205,15 +1205,19 @@ def _to_metadata_events( # noqa: C901 dataset_snapshot.aspects.append(browse_paths) dataset_snapshot.aspects.append(StatusClass(removed=False)) - custom_properties = {} - if self.label is not None: - custom_properties["looker.explore.label"] = str(self.label) - if self.source_file is not None: - custom_properties["looker.explore.file"] = str(self.source_file) + custom_properties = { + "project": self.project_name, + "model": self.model_name, + "looker.explore.label": self.label, + "looker.explore.name": self.name, + "looker.explore.file": self.source_file, + } dataset_props = DatasetPropertiesClass( name=str(self.label) if self.label else LookerUtil._display_name(self.name), description=self.description, - customProperties=custom_properties, + customProperties={ + k: str(v) for k, v in custom_properties.items() if v is not None + }, ) dataset_props.externalUrl = self._get_url(base_url) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index f269ccf1cd98f..e42ac7b61c177 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -139,26 +139,21 @@ class LookerDashboardSource(TestableSource, StatefulIngestionSourceBase): """ platform = "looker" - source_config: LookerDashboardSourceConfig - reporter: LookerDashboardSourceReport - user_registry: LookerUserRegistry - reachable_look_registry: Set[ - str - ] # Keep track of look-id which are reachable from Dashboard def __init__(self, config: 
LookerDashboardSourceConfig, ctx: PipelineContext): super().__init__(config, ctx) - self.source_config = config - self.reporter = LookerDashboardSourceReport() + self.source_config: LookerDashboardSourceConfig = config + self.reporter: LookerDashboardSourceReport = LookerDashboardSourceReport() self.looker_api: LookerAPI = LookerAPI(self.source_config) - self.user_registry = LookerUserRegistry(self.looker_api) - self.explore_registry = LookerExploreRegistry( + self.user_registry: LookerUserRegistry = LookerUserRegistry(self.looker_api) + self.explore_registry: LookerExploreRegistry = LookerExploreRegistry( self.looker_api, self.reporter, self.source_config ) self.reporter._looker_explore_registry = self.explore_registry self.reporter._looker_api = self.looker_api - self.reachable_look_registry = set() + # Keep track of look-id which are reachable from Dashboard + self.reachable_look_registry: Set[str] = set() # (model, explore) -> list of charts/looks/dashboards that reference this explore # The list values are used purely for debugging purposes. @@ -868,21 +863,31 @@ def _make_explore_metadata_events( ) -> Iterable[ Union[MetadataChangeEvent, MetadataChangeProposalWrapper, MetadataWorkUnit] ]: - if self.source_config.emit_used_explores_only: - explores_to_fetch = list(self.reachable_explores.keys()) - else: + if not self.source_config.emit_used_explores_only: explores_to_fetch = list(self.list_all_explores()) + else: + # We don't keep track of project names for each explore right now. + # Because project names are just used for a custom property, it's + # fine to set them to None. + # TODO: Track project names for each explore. + explores_to_fetch = [ + (None, model, explore) + for (model, explore) in self.reachable_explores.keys() + ] explores_to_fetch.sort() processed_models: List[str] = [] - for model, _ in explores_to_fetch: + for project_name, model, _ in explores_to_fetch: if model not in processed_models: model_key = gen_model_key(self.source_config, model) yield from gen_containers( container_key=model_key, name=model, sub_types=[BIContainerSubTypes.LOOKML_MODEL], + extra_properties=( + {"project": project_name} if project_name is not None else None + ), ) yield MetadataChangeProposalWrapper( entityUrn=model_key.as_urn(), @@ -896,7 +901,7 @@ def _make_explore_metadata_events( self.reporter.total_explores = len(explores_to_fetch) for future in BackpressureAwareExecutor.map( self.fetch_one_explore, - ((model, explore) for (model, explore) in explores_to_fetch), + ((model, explore) for (_project, model, explore) in explores_to_fetch), max_workers=self.source_config.max_threads, ): events, explore_id, start_time, end_time = future.result() @@ -907,7 +912,7 @@ def _make_explore_metadata_events( f"Running time of fetch_one_explore for {explore_id}: {(end_time - start_time).total_seconds()}" ) - def list_all_explores(self) -> Iterable[Tuple[str, str]]: + def list_all_explores(self) -> Iterable[Tuple[Optional[str], str, str]]: # returns a list of (model, explore) tuples for model in self.looker_api.all_lookml_models(): @@ -916,7 +921,7 @@ def list_all_explores(self) -> Iterable[Tuple[str, str]]: for explore in model.explores: if explore.name is None: continue - yield (model.name, explore.name) + yield (model.project_name, model.name, explore.name) def fetch_one_explore( self, model: str, explore: str diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index 5cac7b1bb73b1..a9c445b5986ef 
100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -440,7 +441,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "bogus data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/bogus data/my_view", @@ -616,7 +620,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index 24a738a815cda..af9c62a2a4180 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index b1460779da4f5..b89bc356b48fd 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index 74400b9b5cc56..810fefd8f6cb8 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "looker_hub", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": 
"my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json index 89241fb52fb63..3d78397f54a23 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_folder_path_pattern_ingest.json @@ -287,6 +287,7 @@ "description": "third", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -613,7 +614,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index f178e97e78fa0..5a540e61e768d 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -1107,12 +1108,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/data" + "/Explore/sales_model" ] } }, @@ -1124,10 +1125,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/data/my_view", + "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1149,7 +1153,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "my_view", + "schemaName": "sales_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1204,7 +1208,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1223,12 +1227,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/data/my_view" + "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" } }, "systemMetadata": { @@ -1240,12 
+1244,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } }, "systemMetadata": { @@ -1257,7 +1261,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1267,8 +1271,8 @@ "id": "Explore" }, { - "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", - "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" + "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", + "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" } ] } @@ -1283,12 +1287,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/order_model" + "/Explore/data" ] } }, @@ -1300,10 +1304,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/order_model/order_explore", + "externalUrl": "https://looker.company.com/explore/data/my_view", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1325,7 +1332,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "order_explore", + "schemaName": "my_view", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1380,7 +1387,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1399,12 +1406,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" + "renderUrl": "https://looker.company.com/embed/explore/data/my_view" } }, "systemMetadata": { @@ -1416,12 +1423,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "container": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } }, "systemMetadata": { @@ -1433,7 +1440,7 @@ }, { "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,data.explore.my_view,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1443,8 +1450,8 @@ "id": "Explore" }, { - "id": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60", - "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" + "id": "urn:li:container:59a5aa45397364e6882e793f1bc77b42", + "urn": "urn:li:container:59a5aa45397364e6882e793f1bc77b42" } ] } @@ -1459,12 +1466,12 @@ { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.BrowsePaths": { "paths": [ - "/Explore/sales_model" + "/Explore/order_model" ] } }, @@ -1476,10 +1483,13 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "order_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "order_explore", "looker.explore.file": "test_source_file.lkml" }, - "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", + "externalUrl": "https://looker.company.com/explore/order_model/order_explore", "name": "My Explore View", "description": "lorem ipsum", "tags": [] @@ -1501,7 +1511,7 @@ }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "sales_explore", + "schemaName": "order_explore", "platform": "urn:li:dataPlatform:looker", "version": 0, "created": { @@ -1556,7 +1566,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1575,12 +1585,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "embed", "aspect": { "json": { - "renderUrl": "https://looker.company.com/embed/explore/sales_model/sales_explore" + "renderUrl": "https://looker.company.com/embed/explore/order_model/order_explore" } }, "systemMetadata": { @@ -1592,12 +1602,12 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "container": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } }, "systemMetadata": { @@ -1609,7 +1619,7 @@ }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,sales_model.explore.sales_explore,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,order_model.explore.order_explore,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -1619,8 +1629,8 @@ "id": "Explore" }, { - "id": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5", - "urn": "urn:li:container:d38ab60586a6e39b4cf63f14946969c5" + "id": 
"urn:li:container:df4ee66abd19b668c88bfe4408f87e60", + "urn": "urn:li:container:df4ee66abd19b668c88bfe4408f87e60" } ] } diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index d969ef62a96e5..9ac95b8482a47 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -229,6 +229,7 @@ "urn:li:chart:(looker,ap-south-1.dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -574,7 +575,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index 153db363c7828..3a2c6359ea63c 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -202,6 +202,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -520,7 +521,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index 98adbdc5b829e..007eee348aeaf 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -282,7 +283,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json index 63ffdda8c5b6f..859b9163d7aad 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_non_personal_independent_look.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -783,7 +784,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": 
"my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -959,7 +963,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "sales_model", "looker.explore.label": "My Explore View", + "looker.explore.name": "sales_explore", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/sales_model/sales_explore", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index 567ab78a14754..8256c984afb27 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -210,6 +210,7 @@ "urn:li:chart:(looker,dashboard_elements.2)" ], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -539,7 +540,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", @@ -810,8 +814,8 @@ } }, { - "entityType": "chart", - "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -827,8 +831,8 @@ } }, { - "entityType": "container", - "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", + "entityType": "chart", + "entityUrn": "urn:li:chart:(looker,dashboard_elements.10)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -844,8 +848,8 @@ } }, { - "entityType": "dashboard", - "entityUrn": "urn:li:dashboard:(looker,dashboards.11)", + "entityType": "container", + "entityUrn": "urn:li:container:621eb6e00da9abece0f64522f81be0e7", "changeType": "UPSERT", "aspectName": "status", "aspect": { diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 3befb62a631de..0b3530f9c2462 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -11,6 +11,7 @@ "description": "lorem ipsum", "charts": [], "datasets": [], + "dashboards": [], "lastModified": { "created": { "time": 1586847600000, @@ -234,7 +235,10 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { + "project": "lkml_samples", + "model": "data", "looker.explore.label": "My Explore View", + "looker.explore.name": "my_view", "looker.explore.file": "test_source_file.lkml" }, "externalUrl": "https://looker.company.com/explore/data/my_view", From 8f7f2c17242fb084f8784f015f76808bfda9b593 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 18 Oct 2024 14:29:03 -0700 Subject: [PATCH 13/17] feat(ingest/fivetran): protect against high sync volume (#11589) --- .../ingestion/source/fivetran/fivetran.py | 21 ++++--- .../source/fivetran/fivetran_log_api.py | 63 ++++++++++--------- .../source/fivetran/fivetran_query.py | 34 +++++++--- .../integration/fivetran/test_fivetran.py | 58 ++++------------- 4 files 
changed, 87 insertions(+), 89 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 704a6f20a5c19..334bb58ea84f8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,7 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_log_api import ( + MAX_JOBS_PER_CONNECTOR, + FivetranLogAPI, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, ) @@ -72,11 +75,6 @@ def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext): self.audit_log = FivetranLogAPI(self.config.fivetran_log_config) - # Create and register the stateful ingestion use-case handler. - self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( - self, self.config, self.ctx - ) - def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: input_dataset_urn_list: List[DatasetUrn] = [] output_dataset_urn_list: List[DatasetUrn] = [] @@ -267,6 +265,13 @@ def _get_connector_workunits( ).as_workunit(is_primary_source=False) # Map Fivetran's job/sync history entity with Datahub's data process entity + if len(connector.jobs) >= MAX_JOBS_PER_CONNECTOR: + self.report.warning( + title="Not all sync history was captured", + message=f"The connector had more than {MAX_JOBS_PER_CONNECTOR} sync runs in the past {self.config.history_sync_lookback_period} days. " + f"Only the most recent {MAX_JOBS_PER_CONNECTOR} syncs were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) for job in connector.jobs: dpi = self._generate_dpi_from_job(job, datajob) yield from self._get_dpi_workunits(job, dpi) @@ -279,7 +284,9 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), - self.stale_entity_removal_handler.workunit_processor, + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 31c16139066e4..5908efe39e2b4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -22,6 +22,10 @@ logger: logging.Logger = logging.getLogger(__name__) +# We don't want to generate a massive number of dataProcesses for a single connector. +# This is primarily used as a safeguard to prevent performance issues. 
+MAX_JOBS_PER_CONNECTOR = 1000 + class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -158,34 +162,32 @@ def _get_table_lineage( return table_lineage_list - def _get_all_connector_sync_logs(self, syncs_interval: int) -> Dict[str, Dict]: - sync_logs = {} - for row in self._query( - self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, - syncs_interval=syncs_interval, - ) - ): - if row[Constant.CONNECTOR_ID] not in sync_logs: - sync_logs[row[Constant.CONNECTOR_ID]] = { - row[Constant.SYNC_ID]: { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - } - elif row[Constant.SYNC_ID] not in sync_logs[row[Constant.CONNECTOR_ID]]: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]] = { - row["message_event"]: ( - row[Constant.TIME_STAMP].timestamp(), - row[Constant.MESSAGE_DATA], - ) - } - else: - sync_logs[row[Constant.CONNECTOR_ID]][row[Constant.SYNC_ID]][ - row["message_event"] - ] = (row[Constant.TIME_STAMP].timestamp(), row[Constant.MESSAGE_DATA]) + def _get_all_connector_sync_logs( + self, syncs_interval: int, connector_ids: List[str] + ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: + sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} + + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + query = self.fivetran_log_query.get_sync_logs_query().format( + db_clause=self.fivetran_log_query.db_clause, + syncs_interval=syncs_interval, + max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, + connector_ids=formatted_connector_ids, + ) + + for row in self._query(query): + connector_id = row[Constant.CONNECTOR_ID] + sync_id = row[Constant.SYNC_ID] + + if connector_id not in sync_logs: + sync_logs[connector_id] = {} + + sync_logs[connector_id][sync_id] = { + "sync_start": (row["start_time"].timestamp(), None), + "sync_end": (row["end_time"].timestamp(), row["end_message_data"]), + } return sync_logs @@ -244,7 +246,10 @@ def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: def _fill_connectors_jobs( self, connectors: List[Connector], syncs_interval: int ) -> None: - sync_logs = self._get_all_connector_sync_logs(syncs_interval) + connector_ids = [connector.connector_id for connector in connectors] + sync_logs = self._get_all_connector_sync_logs( + syncs_interval, connector_ids=connector_ids + ) for connector in connectors: connector.jobs = self._get_jobs_list(sync_logs.get(connector.connector_id)) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index d965f53ff554b..c4680b4b1037a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -37,14 +37,32 @@ def get_users_query(self) -> str: def get_sync_logs_query(self) -> str: return """ - SELECT connector_id, - sync_id, - message_event, - message_data, - time_stamp - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - and time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days'""" + WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + 
MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({connector_ids}) + GROUP BY connector_id, sync_id + ) + SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data + FROM ranked_syncs + WHERE rn <= {max_jobs_per_connector} + AND start_time IS NOT NULL + AND end_time IS NOT NULL + ORDER BY connector_id, end_time DESC + """ def get_table_lineage_query(self) -> str: return f""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 0f5d098ee39c4..33ac09e69a3c0 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -101,64 +101,32 @@ def default_query_results( } ] elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, syncs_interval=7 + db_clause=fivetran_log_query.db_clause, + syncs_interval=7, + max_jobs_per_connector=1000, + connector_ids="'calendar_elected'", ): return [ { "connector_id": "calendar_elected", "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "start_time": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "end_time": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), + "end_message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "end_time": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), + "end_message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', }, { "connector_id": "calendar_elected", "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_start", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 5, 403000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", - "message_event": "sync_end", - "message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', - "time_stamp": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", - "message_event": "sync_end", - "message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role 
\\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', - "time_stamp": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), - }, - { - "connector_id": "calendar_elected", - "sync_id": "e773e1e9-c791-46f4-894f-8ch9b3dfc832", - "message_event": "sync_end", - "message_data": None, - "time_stamp": datetime.datetime(2023, 10, 3, 14, 37, 35, 478000), + "start_time": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), + "end_time": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), + "end_message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role \\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', }, ] # Unreachable code From 3b1b76244dfc6121e866f9d8803f1491ee6f5cec Mon Sep 17 00:00:00 2001 From: Shirshanka Das Date: Sat, 19 Oct 2024 14:53:28 -0700 Subject: [PATCH 14/17] feat(sdk):platform-resource - complex queries (#11675) --- .../platformresource/platform_resource.py | 193 ++++++------ .../src/datahub/utilities/openapi_utils.py | 69 +++++ .../src/datahub/utilities/search_utils.py | 285 ++++++++++++++++++ .../test_platform_resource.py | 15 + .../tests/unit/utilities/test_search_utils.py | 71 +++++ .../test_platform_resource.py | 78 ++++- 6 files changed, 617 insertions(+), 94 deletions(-) create mode 100644 metadata-ingestion/src/datahub/utilities/openapi_utils.py create mode 100644 metadata-ingestion/src/datahub/utilities/search_utils.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_search_utils.py diff --git a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py index 349b0ff11d84f..0f7b10a067053 100644 --- a/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py +++ b/metadata-ingestion/src/datahub/api/entities/platformresource/platform_resource.py @@ -1,5 +1,5 @@ import logging -from typing import Dict, Iterable, List, Optional, Union +from typing import Callable, Dict, Iterable, List, Optional, Tuple, Type, Union, cast from avrogen.dict_wrapper import DictWrapper from pydantic import BaseModel @@ -14,7 +14,14 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import DatahubKey from datahub.ingestion.graph.client import DataHubGraph -from datahub.metadata.urns import PlatformResourceUrn +from datahub.metadata.urns import DataPlatformUrn, PlatformResourceUrn, Urn +from datahub.utilities.openapi_utils import OpenAPIGraphClient +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + LogicalOperator, + SearchField, +) logger = logging.getLogger(__name__) @@ -69,71 +76,75 @@ def to_resource_info(self) -> models.PlatformResourceInfoClass: ) -class OpenAPIGraphClient: +class DataPlatformInstanceUrn: + """ + A simple implementation of a URN class for DataPlatformInstance. + Since this is not present in the URN registry, we need to implement it here. 
+ """ - ENTITY_KEY_ASPECT_MAP = { - aspect_type.ASPECT_INFO.get("keyForEntity"): name - for name, aspect_type in models.ASPECT_NAME_MAP.items() - if aspect_type.ASPECT_INFO.get("keyForEntity") - } + @staticmethod + def create_from_id(platform_instance_urn: str) -> Urn: + if platform_instance_urn.startswith("urn:li:platformInstance:"): + string_urn = platform_instance_urn + else: + string_urn = f"urn:li:platformInstance:{platform_instance_urn}" + return Urn.from_string(string_urn) - def __init__(self, graph: DataHubGraph): - self.graph = graph - self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3" - def scroll_urns_by_filter( - self, - entity_type: str, - extra_or_filters: List[Dict[str, str]], - extra_and_filters: List[Dict[str, str]] = [], - ) -> Iterable[str]: - """ - Scroll through all urns that match the given filters - """ +class UrnSearchField(SearchField): + """ + A search field that supports URN values. + TODO: Move this to search_utils after we make this more generic. + """ - key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type) - assert key_aspect, f"No key aspect found for entity type {entity_type}" - if extra_or_filters and extra_and_filters: - raise ValueError( - "Only one of extra_or_filters and extra_and_filters should be provided" - ) + def __init__(self, field_name: str, urn_value_extractor: Callable[[str], Urn]): + self.urn_value_extractor = urn_value_extractor + super().__init__(field_name) - count = 1000 - query = ( - " OR ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_or_filters - ] - ) - if extra_or_filters - else " AND ".join( - [ - f"{filter['field']}:\"{filter['value']}\"" - for filter in extra_and_filters - ] - ) + def get_search_value(self, value: str) -> str: + return str(self.urn_value_extractor(value)) + + +class PlatformResourceSearchField(SearchField): + def __init__(self, field_name: str): + super().__init__(field_name) + + @classmethod + def from_search_field( + cls, search_field: SearchField + ) -> "PlatformResourceSearchField": + # pretends to be a class method, but just returns the input + return search_field # type: ignore + + +class PlatformResourceSearchFields: + PRIMARY_KEY = PlatformResourceSearchField("primaryKey") + RESOURCE_TYPE = PlatformResourceSearchField("resourceType") + SECONDARY_KEYS = PlatformResourceSearchField("secondaryKeys") + PLATFORM = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platform.keyword", + urn_value_extractor=DataPlatformUrn.create_from_id, ) - scroll_id = None - while True: - response = self.graph._get_generic( - self.openapi_base + f"/entity/{entity_type.lower()}", - params={ - "systemMetadata": "false", - "includeSoftDelete": "false", - "skipCache": "false", - "aspects": [key_aspect], - "scrollId": scroll_id, - "count": count, - "query": query, - }, - ) - entities = response.get("entities", []) - scroll_id = response.get("scrollId") - for entity in entities: - yield entity["urn"] - if not scroll_id: - break + ) + PLATFORM_INSTANCE = PlatformResourceSearchField.from_search_field( + UrnSearchField( + field_name="platformInstance.keyword", + urn_value_extractor=DataPlatformInstanceUrn.create_from_id, + ) + ) + + +class ElasticPlatformResourceQuery(ElasticDocumentQuery[PlatformResourceSearchField]): + def __init__(self): + super().__init__() + + @classmethod + def create_from( + cls: Type["ElasticPlatformResourceQuery"], + *args: Tuple[Union[str, PlatformResourceSearchField], str], + ) -> "ElasticPlatformResourceQuery": + return 
cast(ElasticPlatformResourceQuery, super().create_from(*args)) class PlatformResource(BaseModel): @@ -147,6 +158,12 @@ def remove( cls, key: PlatformResourceKey, ) -> "PlatformResource": + """ + Creates a PlatformResource object with the removed status set to True. + Removed PlatformResource objects are used to soft-delete resources from + the graph. + To hard-delete a resource, use the delete method. + """ return cls( id=key.id, removed=True, @@ -240,28 +257,38 @@ def from_datahub( @staticmethod def search_by_key( - graph_client: DataHubGraph, key: str, primary: bool = True + graph_client: DataHubGraph, + key: str, + primary: bool = True, + is_exact: bool = True, ) -> Iterable["PlatformResource"]: - extra_or_filters = [] - extra_or_filters.append( - { - "field": "primaryKey", - "condition": "EQUAL", - "value": key, - } + """ + Searches for PlatformResource entities by primary or secondary key. + + :param graph_client: DataHubGraph client + :param key: The key to search for + :param primary: Whether to search for primary only or expand the search + to secondary keys (default: True) + :param is_exact: Whether to search for an exact match (default: True) + :return: An iterable of PlatformResource objects + """ + + elastic_platform_resource_group = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match( + PlatformResourceSearchFields.PRIMARY_KEY, key, is_exact=is_exact + ) ) if not primary: # we expand the search to secondary keys - extra_or_filters.append( - { - "field": "secondaryKeys", - "condition": "EQUAL", - "value": key, - } + elastic_platform_resource_group.add_field_match( + PlatformResourceSearchFields.SECONDARY_KEYS, key, is_exact=is_exact ) + query = elastic_platform_resource_group.end() openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=extra_or_filters, + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: @@ -273,18 +300,16 @@ def delete(self, graph_client: DataHubGraph, hard: bool = True) -> None: @staticmethod def search_by_filters( graph_client: DataHubGraph, - and_filters: List[Dict[str, str]] = [], - or_filters: List[Dict[str, str]] = [], + query: Union[ + ElasticPlatformResourceQuery, + ElasticDocumentQuery, + ElasticsearchQueryBuilder, + ], ) -> Iterable["PlatformResource"]: - if and_filters and or_filters: - raise ValueError( - "Only one of and_filters and or_filters should be provided" - ) openapi_client = OpenAPIGraphClient(graph_client) for urn in openapi_client.scroll_urns_by_filter( entity_type="platformResource", - extra_or_filters=or_filters if or_filters else [], - extra_and_filters=and_filters if and_filters else [], + query=query, ): platform_resource = PlatformResource.from_datahub(graph_client, urn) if platform_resource: diff --git a/metadata-ingestion/src/datahub/utilities/openapi_utils.py b/metadata-ingestion/src/datahub/utilities/openapi_utils.py new file mode 100644 index 0000000000000..e704ff7f84cbb --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/openapi_utils.py @@ -0,0 +1,69 @@ +import logging +from typing import Iterable, Union + +import datahub.metadata.schema_classes as models +from datahub.ingestion.graph.client import DataHubGraph +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + ElasticsearchQueryBuilder, +) + +logger = logging.getLogger(__name__) + + +class OpenAPIGraphClient: + """ + An experimental client for the 
DataHubGraph that uses the OpenAPI endpoints
+    to query entities and aspects.
+    Does not support all features of the DataHubGraph.
+    API is subject to change.
+
+    DO NOT USE THIS UNLESS YOU KNOW WHAT YOU ARE DOING.
+    """
+
+    ENTITY_KEY_ASPECT_MAP = {
+        aspect_type.ASPECT_INFO.get("keyForEntity"): name
+        for name, aspect_type in models.ASPECT_NAME_MAP.items()
+        if aspect_type.ASPECT_INFO.get("keyForEntity")
+    }
+
+    def __init__(self, graph: DataHubGraph):
+        self.graph = graph
+        self.openapi_base = graph._gms_server.rstrip("/") + "/openapi/v3"
+
+    def scroll_urns_by_filter(
+        self,
+        entity_type: str,
+        query: Union[ElasticDocumentQuery, ElasticsearchQueryBuilder],
+    ) -> Iterable[str]:
+        """
+        Scroll through all urns that match the given filters.
+
+        """
+
+        key_aspect = self.ENTITY_KEY_ASPECT_MAP.get(entity_type)
+        assert key_aspect, f"No key aspect found for entity type {entity_type}"
+
+        count = 1000
+        string_query = query.build()
+        scroll_id = None
+        logger.debug(f"Scrolling with query: {string_query}")
+        while True:
+            response = self.graph._get_generic(
+                self.openapi_base + f"/entity/{entity_type.lower()}",
+                params={
+                    "systemMetadata": "false",
+                    "includeSoftDelete": "false",
+                    "skipCache": "false",
+                    "aspects": [key_aspect],
+                    "scrollId": scroll_id,
+                    "count": count,
+                    "query": string_query,
+                },
+            )
+            entities = response.get("entities", [])
+            scroll_id = response.get("scrollId")
+            for entity in entities:
+                yield entity["urn"]
+            if not scroll_id:
+                break
diff --git a/metadata-ingestion/src/datahub/utilities/search_utils.py b/metadata-ingestion/src/datahub/utilities/search_utils.py
new file mode 100644
index 0000000000000..0bd88addd8660
--- /dev/null
+++ b/metadata-ingestion/src/datahub/utilities/search_utils.py
@@ -0,0 +1,285 @@
+import logging
+import re
+from enum import Enum
+from typing import Generic, List, Optional, Tuple, Type, TypeVar, Union
+
+logger = logging.getLogger(__name__)
+
+
+class LogicalOperator(Enum):
+    AND = "AND"
+    OR = "OR"
+
+
+class SearchField:
+    def __init__(self, field_name: str):
+        self.field_name = field_name
+
+    def get_search_value(self, value: str) -> str:
+        return value
+
+    def __str__(self) -> str:
+        return self.field_name
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+    @classmethod
+    def from_string_field(cls, field_name: str) -> "SearchField":
+        return cls(field_name)
+
+
+class QueryNode:
+    def __init__(self, operator: Optional[LogicalOperator] = None):
+        self.operator = operator
+        self.children: List[Union[QueryNode, str]] = []
+
+    def add_child(self, child: Union["QueryNode", str]) -> None:
+        self.children.append(child)
+
+    def build(self) -> str:
+        if not self.children:
+            return ""
+
+        if self.operator is None:
+            return (
+                self.children[0]
+                if isinstance(self.children[0], str)
+                else self.children[0].build()
+            )
+
+        child_queries = []
+        for child in self.children:
+            if isinstance(child, str):
+                child_queries.append(child)
+            else:
+                child_queries.append(child.build())
+
+        joined_queries = f" {self.operator.value} ".join(child_queries)
+        return f"({joined_queries})" if len(child_queries) > 1 else joined_queries
+
+
+class ElasticsearchQueryBuilder:
+    SPECIAL_CHARACTERS = r'+-=&|><!(){}[]^"~*?:\\/'
+
+    def __init__(self, operator: LogicalOperator = LogicalOperator.AND) -> None:
+        self.root = QueryNode(operator=operator)
+
+    @classmethod
+    def escape_special_characters(cls, value: str) -> str:
+        """
+        Escape special characters in the search term.
+ """ + return re.sub(f"([{re.escape(cls.SPECIAL_CHARACTERS)}])", r"\\\1", value) + + def _create_term( + self, field: SearchField, value: str, is_exact: bool = False + ) -> str: + escaped_value = self.escape_special_characters(field.get_search_value(value)) + field_name: str = field.field_name + if is_exact: + return f'{field_name}:"{escaped_value}"' + return f"{field_name}:{escaped_value}" + + def add_field_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = self._create_term(field, value, is_exact) + self.root.add_child(term) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticsearchQueryBuilder": + term = f"-{self._create_term(field, value, is_exact)}" + self.root.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticsearchQueryBuilder": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.root.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "ElasticsearchQueryBuilder": + wildcard_query = f"{field}:{pattern}" + self.root.add_child(wildcard_query) + return self + + def add_fuzzy( + self, field: str, value: str, fuzziness: int = 2 + ) -> "ElasticsearchQueryBuilder": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.root.add_child(fuzzy_query) + return self + + def add_boost( + self, field: str, value: str, boost: float + ) -> "ElasticsearchQueryBuilder": + boosted_query = f"{field}:{value}^{boost}" + self.root.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + return QueryGroup(self, operator) + + def build(self) -> str: + return self.root.build() + + +class QueryGroup: + def __init__(self, parent: ElasticsearchQueryBuilder, operator: LogicalOperator): + self.parent = parent + self.node = QueryNode(operator) + self.parent.root.add_child(self.node) + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = self.parent._create_term(field, value, is_exact) + self.node.add_child(term) + return self + + def add_field_not_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "QueryGroup": + if isinstance(field, str): + field = SearchField.from_string_field(field) + term = f"-{self.parent._create_term(field, value, is_exact)}" + self.node.add_child(term) + return self + + def add_range( + self, + field: str, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "QueryGroup": + min_bracket = "[" if include_min else "{" + max_bracket = "]" if include_max else "}" + min_val = min_value if min_value is not None else "*" + max_val = max_value if max_value is not None else "*" + range_query = f"{field}:{min_bracket}{min_val} TO {max_val}{max_bracket}" + self.node.add_child(range_query) + return self + + def add_wildcard(self, field: str, pattern: str) -> "QueryGroup": + wildcard_query = f"{field}:{pattern}" + 
self.node.add_child(wildcard_query) + return self + + def add_fuzzy(self, field: str, value: str, fuzziness: int = 2) -> "QueryGroup": + fuzzy_query = f"{field}:{value}~{fuzziness}" + self.node.add_child(fuzzy_query) + return self + + def add_boost(self, field: str, value: str, boost: float) -> "QueryGroup": + boosted_query = f"{field}:{value}^{boost}" + self.node.add_child(boosted_query) + return self + + def group(self, operator: LogicalOperator) -> "QueryGroup": + new_group = QueryGroup(self.parent, operator) + self.node.add_child(new_group.node) + return new_group + + def end(self) -> ElasticsearchQueryBuilder: + return self.parent + + +SF = TypeVar("SF", bound=SearchField) + + +class ElasticDocumentQuery(Generic[SF]): + def __init__(self) -> None: + self.query_builder = ElasticsearchQueryBuilder() + + @classmethod + def create_from( + cls: Type["ElasticDocumentQuery[SF]"], + *args: Tuple[Union[str, SF], str], + ) -> "ElasticDocumentQuery[SF]": + instance = cls() + for arg in args: + if isinstance(arg, SearchField): + # If the value is empty, we treat it as a wildcard search + logger.info(f"Adding wildcard search for field {arg}") + instance.add_wildcard(arg, "*") + elif isinstance(arg, tuple) and len(arg) == 2: + field, value = arg + assert isinstance(value, str) + if isinstance(field, SearchField): + instance.add_field_match(field, value) + elif isinstance(field, str): + instance.add_field_match( + SearchField.from_string_field(field), value + ) + else: + raise ValueError("Invalid field type {}".format(type(field))) + return instance + + def add_field_match( + self, field: Union[str, SearchField], value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + if isinstance(field, str): + field = SearchField.from_string_field(field) + self.query_builder.add_field_match(field, value, is_exact) + return self + + def add_field_not_match( + self, field: SearchField, value: str, is_exact: bool = True + ) -> "ElasticDocumentQuery": + self.query_builder.add_field_not_match(field, value, is_exact) + return self + + def add_range( + self, + field: SearchField, + min_value: Optional[str] = None, + max_value: Optional[str] = None, + include_min: bool = True, + include_max: bool = True, + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_range( + field_name, min_value, max_value, include_min, include_max + ) + return self + + def add_wildcard(self, field: SearchField, pattern: str) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_wildcard(field_name, pattern) + return self + + def add_fuzzy( + self, field: SearchField, value: str, fuzziness: int = 2 + ) -> "ElasticDocumentQuery": + field_name: str = field.field_name # type: ignore + self.query_builder.add_fuzzy(field_name, value, fuzziness) + return self + + def add_boost( + self, field: SearchField, value: str, boost: float + ) -> "ElasticDocumentQuery": + self.query_builder.add_boost(field.field_name, value, boost) + return self + + def group(self, operator: LogicalOperator) -> QueryGroup: + return self.query_builder.group(operator) + + def build(self) -> str: + return self.query_builder.build() diff --git a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py index e6c9a9466d62b..a84e373dbe72c 100644 --- a/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py +++ 
b/metadata-ingestion/tests/unit/api/entities/platformresource/test_platform_resource.py @@ -4,9 +4,12 @@ import datahub.metadata.schema_classes as models from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) +from datahub.utilities.search_utils import LogicalOperator def test_platform_resource_dict(): @@ -179,3 +182,15 @@ class TestModel(BaseModel): ).encode("utf-8") assert platform_resource_info_mcp.aspect.value.schemaType == "JSON" assert platform_resource_info_mcp.aspect.value.schemaRef == TestModel.__name__ + + +def test_platform_resource_filters(): + + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match(PlatformResourceSearchFields.PRIMARY_KEY, "test_1") + .add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, "server") + .end() + ) + assert query.build() == '(primaryKey:"test_1" AND resourceType:"server")' diff --git a/metadata-ingestion/tests/unit/utilities/test_search_utils.py b/metadata-ingestion/tests/unit/utilities/test_search_utils.py new file mode 100644 index 0000000000000..6fa2e46c7f20e --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_search_utils.py @@ -0,0 +1,71 @@ +from datahub.utilities.search_utils import ( + ElasticDocumentQuery, + LogicalOperator, + SearchField, +) + + +def test_simple_and_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND field2:"value2")' + + +def test_simple_or_filters(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match("field1", "value1") + .add_field_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" OR field2:"value2")' + + # Use SearchFilter to create this query + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.OR) + .add_field_match(SearchField.from_string_field("field1"), "value1") + .add_field_match(SearchField.from_string_field("field2"), "value2") + .end() + ) + assert query.build() == '(field1:"value1" OR field2:"value2")' + + +def test_simple_field_match(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1:1") + ) + assert query.build() == 'field1:"value1\\:1"' + + # Another way to create the same query + query = ElasticDocumentQuery.create_from() + query.add_field_match("field1", "value1:1") + assert query.build() == 'field1:"value1\\:1"' + + +def test_negation(): + query = ( + ElasticDocumentQuery.create_from() + .group(LogicalOperator.AND) + .add_field_match("field1", "value1") + .add_field_not_match("field2", "value2") + .end() + ) + + assert query.build() == '(field1:"value1" AND -field2:"value2")' + + +def test_multi_arg_create_from(): + query: ElasticDocumentQuery = ElasticDocumentQuery.create_from( + ("field1", "value1"), + ("field2", "value2"), + ) + assert query.build() == '(field1:"value1" AND field2:"value2")' diff --git a/smoke-test/tests/platform_resources/test_platform_resource.py b/smoke-test/tests/platform_resources/test_platform_resource.py index 7ebfd4d6ea15b..39d15f2e8dea6 100644 --- a/smoke-test/tests/platform_resources/test_platform_resource.py +++ b/smoke-test/tests/platform_resources/test_platform_resource.py @@ -5,8 +5,10 @@ import pytest from datahub.api.entities.platformresource.platform_resource import 
( + ElasticPlatformResourceQuery, PlatformResource, PlatformResourceKey, + PlatformResourceSearchFields, ) from tests.utils import wait_for_healthcheck_util, wait_for_writes_to_sync @@ -42,7 +44,12 @@ def cleanup_resources(graph_client): logger.warning(f"Failed to delete resource: {e}") # Additional cleanup for any resources that might have been missed - for resource in PlatformResource.search_by_key(graph_client, "test_"): + for resource in PlatformResource.search_by_filters( + graph_client, + ElasticPlatformResourceQuery.create_from().add_wildcard( + PlatformResourceSearchFields.PRIMARY_KEY, "test_*" + ), + ): try: resource.delete(graph_client) except Exception as e: @@ -114,7 +121,7 @@ def test_platform_resource_non_existent(graph_client, test_id): assert platform_resource is None -def test_platform_resource_urn_secondary_key(graph_client, test_id): +def test_platform_resource_urn_secondary_key(graph_client, test_id, cleanup_resources): key = PlatformResourceKey( platform=f"test_platform_{test_id}", resource_type=f"test_resource_type_{test_id}", @@ -129,6 +136,7 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): secondary_keys=[dataset_urn], ) platform_resource.to_datahub(graph_client) + cleanup_resources.append(platform_resource) wait_for_writes_to_sync() read_platform_resources = [ @@ -141,7 +149,9 @@ def test_platform_resource_urn_secondary_key(graph_client, test_id): assert read_platform_resources[0] == platform_resource -def test_platform_resource_listing_by_resource_type(graph_client, test_id): +def test_platform_resource_listing_by_resource_type( + graph_client, test_id, cleanup_resources +): # Generate two resources with the same resource type key1 = PlatformResourceKey( platform=f"test_platform_{test_id}", @@ -171,13 +181,9 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): r for r in PlatformResource.search_by_filters( graph_client, - and_filters=[ - { - "field": "resourceType", - "condition": "EQUAL", - "value": key1.resource_type, - } - ], + query=ElasticPlatformResourceQuery.create_from( + (PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + ), ) ] assert len(search_results) == 2 @@ -186,3 +192,55 @@ def test_platform_resource_listing_by_resource_type(graph_client, test_id): read_platform_resource_2 = next(r for r in search_results if r.id == key2.id) assert read_platform_resource_1 == platform_resource1 assert read_platform_resource_2 == platform_resource2 + + +def test_platform_resource_listing_complex_queries(graph_client, test_id): + # Generate two resources with the same resource type + key1 = PlatformResourceKey( + platform=f"test_platform1_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_1_{test_id}", + ) + platform_resource1 = PlatformResource.create( + key=key1, + value={"test_key": f"test_value_1_{test_id}"}, + ) + platform_resource1.to_datahub(graph_client) + + key2 = PlatformResourceKey( + platform=f"test_platform2_{test_id}", + resource_type=f"test_resource_type_{test_id}", + primary_key=f"test_primary_key_2_{test_id}", + ) + platform_resource2 = PlatformResource.create( + key=key2, + value={"test_key": f"test_value_2_{test_id}"}, + ) + platform_resource2.to_datahub(graph_client) + + wait_for_writes_to_sync() + from datahub.api.entities.platformresource.platform_resource import ( + ElasticPlatformResourceQuery, + LogicalOperator, + PlatformResourceSearchFields, + ) + + query = ( + ElasticPlatformResourceQuery.create_from() + .group(LogicalOperator.AND) + 
.add_field_match(PlatformResourceSearchFields.RESOURCE_TYPE, key1.resource_type) + .add_field_not_match(PlatformResourceSearchFields.PLATFORM, key1.platform) + .end() + ) + + search_results = [ + r + for r in PlatformResource.search_by_filters( + graph_client, + query=query, + ) + ] + assert len(search_results) == 1 + + read_platform_resource = search_results[0] + assert read_platform_resource == platform_resource2 From 2f85bc1ecbd025d9ce0c6d3da644b3f83656755c Mon Sep 17 00:00:00 2001 From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com> Date: Sun, 20 Oct 2024 06:58:32 +0530 Subject: [PATCH 15/17] fix(docs): fix businessattributes doc (#11653) --- docs/businessattributes.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/businessattributes.md b/docs/businessattributes.md index 1744f48f879e8..3e912e7e60980 100644 --- a/docs/businessattributes.md +++ b/docs/businessattributes.md @@ -28,7 +28,6 @@ Taking the example of "United States- Social Security Number", if an application What you need to create/update and associate business attributes to dataset schema field * **Manage Business Attributes** platform privilege to create/update/delete business attributes. -* **Edit Dataset Column Business Attribute** metadata privilege to associate business attributes to dataset schema field. ## Using Business Attributes As of now Business Attributes can only be created through UI From dd1a06fb558a2177ea02437ea0aa7172c2ccd781 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Sun, 20 Oct 2024 23:59:45 -0700 Subject: [PATCH 16/17] feat(ingest/fivetran): add safeguards on table/column lineage (#11674) --- .../ingestion/source/fivetran/config.py | 19 +-- .../ingestion/source/fivetran/data_classes.py | 2 +- .../ingestion/source/fivetran/fivetran.py | 23 ++- .../source/fivetran/fivetran_log_api.py | 86 +++++------ .../source/fivetran/fivetran_query.py | 143 +++++++++++------- .../integration/fivetran/test_fivetran.py | 6 +- 6 files changed, 156 insertions(+), 123 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py index 02eb096b240f5..2fb5ffd16ea34 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -1,6 +1,6 @@ +import dataclasses import logging -from dataclasses import dataclass, field as dataclass_field -from typing import Dict, List, Optional +from typing import Dict, Optional import pydantic from pydantic import Field, root_validator @@ -23,6 +23,7 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionConfigBase, ) +from datahub.utilities.lossy_collections import LossyList from datahub.utilities.perf_timer import PerfTimer logger = logging.getLogger(__name__) @@ -114,24 +115,24 @@ def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict: return values -@dataclass +@dataclasses.dataclass class MetadataExtractionPerfReport(Report): - connectors_metadata_extraction_sec: PerfTimer = dataclass_field( + connectors_metadata_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_lineage_extraction_sec: PerfTimer = dataclass_field( + connectors_lineage_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) - connectors_jobs_extraction_sec: PerfTimer = dataclass_field( + connectors_jobs_extraction_sec: PerfTimer = dataclasses.field( default_factory=PerfTimer ) -@dataclass 
+@dataclasses.dataclass class FivetranSourceReport(StaleEntityRemovalSourceReport): connectors_scanned: int = 0 - filtered_connectors: List[str] = dataclass_field(default_factory=list) - metadata_extraction_perf: MetadataExtractionPerfReport = dataclass_field( + filtered_connectors: LossyList[str] = dataclasses.field(default_factory=LossyList) + metadata_extraction_perf: MetadataExtractionPerfReport = dataclasses.field( default_factory=MetadataExtractionPerfReport ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py index 18de2b01edd3b..046aa9efe3f59 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py @@ -24,7 +24,7 @@ class Connector: sync_frequency: int destination_id: str user_id: str - table_lineage: List[TableLineage] + lineage: List[TableLineage] jobs: List["Job"] diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py index 334bb58ea84f8..c27ec57c2e99e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -27,9 +27,10 @@ PlatformDetail, ) from datahub.ingestion.source.fivetran.data_classes import Connector, Job -from datahub.ingestion.source.fivetran.fivetran_log_api import ( +from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.fivetran.fivetran_query import ( MAX_JOBS_PER_CONNECTOR, - FivetranLogAPI, + MAX_TABLE_LINEAGE_PER_CONNECTOR, ) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, @@ -106,13 +107,21 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity." ) - for table_lineage in connector.table_lineage: + if len(connector.lineage) >= MAX_TABLE_LINEAGE_PER_CONNECTOR: + self.report.warning( + title="Table lineage truncated", + message=f"The connector had more than {MAX_TABLE_LINEAGE_PER_CONNECTOR} table lineage entries. 
" + f"Only the most recent {MAX_TABLE_LINEAGE_PER_CONNECTOR} entries were ingested.", + context=f"{connector.connector_name} (connector_id: {connector.connector_id})", + ) + + for lineage in connector.lineage: input_dataset_urn = DatasetUrn.create_from_ids( platform_id=source_platform, table_name=( - f"{source_database.lower()}.{table_lineage.source_table}" + f"{source_database.lower()}.{lineage.source_table}" if source_database - else table_lineage.source_table + else lineage.source_table ), env=source_platform_detail.env, platform_instance=source_platform_detail.platform_instance, @@ -121,14 +130,14 @@ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: output_dataset_urn = DatasetUrn.create_from_ids( platform_id=self.config.fivetran_log_config.destination_platform, - table_name=f"{self.audit_log.fivetran_log_database.lower()}.{table_lineage.destination_table}", + table_name=f"{self.audit_log.fivetran_log_database.lower()}.{lineage.destination_table}", env=destination_platform_detail.env, platform_instance=destination_platform_detail.platform_instance, ) output_dataset_urn_list.append(output_dataset_urn) if self.config.include_column_lineage: - for column_lineage in table_lineage.column_lineage: + for column_lineage in lineage.column_lineage: fine_grained_lineage.append( FineGrainedLineage( upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py index 5908efe39e2b4..b55c8bbbd607f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -1,6 +1,7 @@ import functools import json import logging +from collections import defaultdict from typing import Any, Dict, List, Optional, Tuple import sqlglot @@ -22,10 +23,6 @@ logger: logging.Logger = logging.getLogger(__name__) -# We don't want to generate a massive number of dataProcesses for a single connector. -# This is primarily used as a safeguard to prevent performance issues. 
-MAX_JOBS_PER_CONNECTOR = 1000 - class FivetranLogAPI: def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: @@ -91,55 +88,51 @@ def _query(self, query: str) -> List[Dict]: resp = self.engine.execute(query) return [row for row in resp] - def _get_column_lineage_metadata(self) -> Dict[str, List]: + def _get_column_lineage_metadata(self) -> Dict[Tuple[str, str], List]: """ - Return's dict of column lineage metadata with key as '-' + Returns dict of column lineage metadata with key as (, ) """ - all_column_lineage: Dict[str, List] = {} + all_column_lineage = defaultdict(list) column_lineage_result = self._query( self.fivetran_log_query.get_column_lineage_query() ) for column_lineage in column_lineage_result: - key = f"{column_lineage[Constant.SOURCE_TABLE_ID]}-{column_lineage[Constant.DESTINATION_TABLE_ID]}" - if key not in all_column_lineage: - all_column_lineage[key] = [column_lineage] - else: - all_column_lineage[key].append(column_lineage) - return all_column_lineage + key = ( + column_lineage[Constant.SOURCE_TABLE_ID], + column_lineage[Constant.DESTINATION_TABLE_ID], + ) + all_column_lineage[key].append(column_lineage) + return dict(all_column_lineage) - def _get_connectors_table_lineage_metadata(self) -> Dict[str, List]: + def _get_table_lineage_metadata(self) -> Dict[str, List]: """ - Return's dict of table lineage metadata with key as 'CONNECTOR_ID' + Returns dict of table lineage metadata with key as 'CONNECTOR_ID' """ - connectors_table_lineage_metadata: Dict[str, List] = {} + connectors_table_lineage_metadata = defaultdict(list) table_lineage_result = self._query( self.fivetran_log_query.get_table_lineage_query() ) for table_lineage in table_lineage_result: - if ( + connectors_table_lineage_metadata[ table_lineage[Constant.CONNECTOR_ID] - not in connectors_table_lineage_metadata - ): - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ] = [table_lineage] - else: - connectors_table_lineage_metadata[ - table_lineage[Constant.CONNECTOR_ID] - ].append(table_lineage) - return connectors_table_lineage_metadata + ].append(table_lineage) + return dict(connectors_table_lineage_metadata) - def _get_table_lineage( + def _extract_connector_lineage( self, - column_lineage_metadata: Dict[str, List], table_lineage_result: Optional[List], + column_lineage_metadata: Dict[Tuple[str, str], List], ) -> List[TableLineage]: table_lineage_list: List[TableLineage] = [] if table_lineage_result is None: return table_lineage_list for table_lineage in table_lineage_result: + # Join the column lineage into the table lineage. 
column_lineage_result = column_lineage_metadata.get( - f"{table_lineage[Constant.SOURCE_TABLE_ID]}-{table_lineage[Constant.DESTINATION_TABLE_ID]}" + ( + table_lineage[Constant.SOURCE_TABLE_ID], + table_lineage[Constant.DESTINATION_TABLE_ID], + ) ) column_lineage_list: List[ColumnLineage] = [] if column_lineage_result: @@ -152,6 +145,7 @@ def _get_table_lineage( ) for column_lineage in column_lineage_result ] + table_lineage_list.append( TableLineage( source_table=f"{table_lineage[Constant.SOURCE_SCHEMA_NAME]}.{table_lineage[Constant.SOURCE_TABLE_NAME]}", @@ -167,14 +161,9 @@ def _get_all_connector_sync_logs( ) -> Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]]: sync_logs: Dict[str, Dict[str, Dict[str, Tuple[float, Optional[str]]]]] = {} - # Format connector_ids as a comma-separated string of quoted IDs - formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) - - query = self.fivetran_log_query.get_sync_logs_query().format( - db_clause=self.fivetran_log_query.db_clause, + query = self.fivetran_log_query.get_sync_logs_query( syncs_interval=syncs_interval, - max_jobs_per_connector=MAX_JOBS_PER_CONNECTOR, - connector_ids=formatted_connector_ids, + connector_ids=connector_ids, ) for row in self._query(query): @@ -234,13 +223,13 @@ def get_user_email(self, user_id: str) -> Optional[str]: return None return self._get_users().get(user_id) - def _fill_connectors_table_lineage(self, connectors: List[Connector]) -> None: - table_lineage_metadata = self._get_connectors_table_lineage_metadata() + def _fill_connectors_lineage(self, connectors: List[Connector]) -> None: + table_lineage_metadata = self._get_table_lineage_metadata() column_lineage_metadata = self._get_column_lineage_metadata() for connector in connectors: - connector.table_lineage = self._get_table_lineage( - column_lineage_metadata=column_lineage_metadata, + connector.lineage = self._extract_connector_lineage( table_lineage_result=table_lineage_metadata.get(connector.connector_id), + column_lineage_metadata=column_lineage_metadata, ) def _fill_connectors_jobs( @@ -262,6 +251,7 @@ def get_allowed_connectors_list( ) -> List[Connector]: connectors: List[Connector] = [] with report.metadata_extraction_perf.connectors_metadata_extraction_sec: + logger.info("Fetching connector list") connector_list = self._query(self.fivetran_log_query.get_connectors_query()) for connector in connector_list: if not connector_patterns.allowed(connector[Constant.CONNECTOR_NAME]): @@ -279,12 +269,20 @@ def get_allowed_connectors_list( sync_frequency=connector[Constant.SYNC_FREQUENCY], destination_id=connector[Constant.DESTINATION_ID], user_id=connector[Constant.CONNECTING_USER_ID], - table_lineage=[], - jobs=[], + lineage=[], # filled later + jobs=[], # filled later ) ) + + if not connectors: + # Some of our queries don't work well when there's no connectors, since + # we push down connector id filters. 
+ return [] + with report.metadata_extraction_perf.connectors_lineage_extraction_sec: - self._fill_connectors_table_lineage(connectors) + logger.info("Fetching connector lineage") + self._fill_connectors_lineage(connectors) with report.metadata_extraction_perf.connectors_jobs_extraction_sec: + logger.info("Fetching connector job run history") self._fill_connectors_jobs(connectors, syncs_interval) return connectors diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py index c4680b4b1037a..c9e329b706768 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -1,3 +1,11 @@ +from typing import List + +# Safeguards to prevent fetching massive amounts of data. +MAX_TABLE_LINEAGE_PER_CONNECTOR = 100 +MAX_COLUMN_LINEAGE_PER_CONNECTOR = 3000 +MAX_JOBS_PER_CONNECTOR = 1000 + + class FivetranLogQuery: # Note: All queries are written in Snowflake SQL. # They will be transpiled to the target database's SQL dialect at runtime. @@ -24,69 +32,88 @@ def get_connectors_query(self) -> str: destination_id FROM {self.db_clause}connector WHERE - _fivetran_deleted = FALSE\ + _fivetran_deleted = FALSE """ def get_users_query(self) -> str: - return f""" - SELECT id as user_id, - given_name, - family_name, - email - FROM {self.db_clause}user""" + return f"""\ +SELECT id as user_id, +given_name, +family_name, +email +FROM {self.db_clause}user +""" - def get_sync_logs_query(self) -> str: - return """ - WITH ranked_syncs AS ( - SELECT - connector_id, - sync_id, - MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, - MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, - MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, - ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn - FROM {db_clause}log - WHERE message_event in ('sync_start', 'sync_end') - AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' - AND connector_id IN ({connector_ids}) - GROUP BY connector_id, sync_id - ) - SELECT - connector_id, - sync_id, - start_time, - end_time, - end_message_data - FROM ranked_syncs - WHERE rn <= {max_jobs_per_connector} - AND start_time IS NOT NULL - AND end_time IS NOT NULL - ORDER BY connector_id, end_time DESC - """ + def get_sync_logs_query( + self, + syncs_interval: int, + connector_ids: List[str], + ) -> str: + # Format connector_ids as a comma-separated string of quoted IDs + formatted_connector_ids = ", ".join(f"'{id}'" for id in connector_ids) + + return f"""\ +WITH ranked_syncs AS ( + SELECT + connector_id, + sync_id, + MAX(CASE WHEN message_event = 'sync_start' THEN time_stamp END) as start_time, + MAX(CASE WHEN message_event = 'sync_end' THEN time_stamp END) as end_time, + MAX(CASE WHEN message_event = 'sync_end' THEN message_data END) as end_message_data, + ROW_NUMBER() OVER (PARTITION BY connector_id ORDER BY MAX(time_stamp) DESC) as rn + FROM {self.db_clause}log + WHERE message_event in ('sync_start', 'sync_end') + AND time_stamp > CURRENT_TIMESTAMP - INTERVAL '{syncs_interval} days' + AND connector_id IN ({formatted_connector_ids}) + GROUP BY connector_id, sync_id +) +SELECT + connector_id, + sync_id, + start_time, + end_time, + end_message_data +FROM ranked_syncs +WHERE rn <= {MAX_JOBS_PER_CONNECTOR} + AND start_time IS NOT NULL + AND end_time IS NOT NULL 
+ORDER BY connector_id, end_time DESC +""" def get_table_lineage_query(self) -> str: - return f""" - SELECT stm.connector_id as connector_id, - stm.id as source_table_id, - stm.name as source_table_name, - ssm.name as source_schema_name, - dtm.id as destination_table_id, - dtm.name as destination_table_name, - dsm.name as destination_schema_name - FROM {self.db_clause}table_lineage as tl - JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id - JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id - JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id - JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id""" + return f"""\ +SELECT + stm.connector_id as connector_id, + stm.id as source_table_id, + stm.name as source_table_name, + ssm.name as source_schema_name, + dtm.id as destination_table_id, + dtm.name as destination_table_name, + dsm.name as destination_schema_name +FROM {self.db_clause}table_lineage as tl +JOIN {self.db_clause}source_table_metadata as stm on tl.source_table_id = stm.id +JOIN {self.db_clause}destination_table_metadata as dtm on tl.destination_table_id = dtm.id +JOIN {self.db_clause}source_schema_metadata as ssm on stm.schema_id = ssm.id +JOIN {self.db_clause}destination_schema_metadata as dsm on dtm.schema_id = dsm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY tl.created_at DESC) <= {MAX_TABLE_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, tl.created_at DESC +""" def get_column_lineage_query(self) -> str: - return f""" - SELECT scm.table_id as source_table_id, - dcm.table_id as destination_table_id, - scm.name as source_column_name, - dcm.name as destination_column_name - FROM {self.db_clause}column_lineage as cl - JOIN {self.db_clause}source_column_metadata as scm - on cl.source_column_id = scm.id - JOIN {self.db_clause}destination_column_metadata as dcm - on cl.destination_column_id = dcm.id""" + return f"""\ +SELECT + scm.table_id as source_table_id, + dcm.table_id as destination_table_id, + scm.name as source_column_name, + dcm.name as destination_column_name +FROM {self.db_clause}column_lineage as cl +JOIN {self.db_clause}source_column_metadata as scm + ON cl.source_column_id = scm.id +JOIN {self.db_clause}destination_column_metadata as dcm + ON cl.destination_column_id = dcm.id +-- Only joining source_table_metadata to get the connector_id. 
+JOIN {self.db_clause}source_table_metadata as stm + ON scm.table_id = stm.id +QUALIFY ROW_NUMBER() OVER (PARTITION BY stm.connector_id ORDER BY cl.created_at DESC) <= {MAX_COLUMN_LINEAGE_PER_CONNECTOR} +ORDER BY stm.connector_id, cl.created_at DESC +""" diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py index 33ac09e69a3c0..e72162b12e48f 100644 --- a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -100,11 +100,9 @@ def default_query_results( "email": "abc.xyz@email.com", } ] - elif query == fivetran_log_query.get_sync_logs_query().format( - db_clause=fivetran_log_query.db_clause, + elif query == fivetran_log_query.get_sync_logs_query( syncs_interval=7, - max_jobs_per_connector=1000, - connector_ids="'calendar_elected'", + connector_ids=["calendar_elected"], ): return [ { From 554288bb05354dae16471381daec3549658bdda3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Mon, 21 Oct 2024 09:00:09 +0200 Subject: [PATCH 17/17] fix(ui): show DataHub logo for DataHub sources in ingestion souces list (#11658) Co-authored-by: Shirshanka Das --- .../src/app/ingest/source/builder/constants.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index b67ca388c1054..0e0ba8b22e37e 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -35,6 +35,7 @@ import csvLogo from '../../../../images/csv-logo.png'; import qlikLogo from '../../../../images/qliklogo.png'; import sigmaLogo from '../../../../images/sigmalogo.png'; import sacLogo from '../../../../images/saclogo.svg'; +import datahubLogo from '../../../../images/datahublogo.png'; export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -125,6 +126,11 @@ export const SIGMA = 'sigma'; export const SIGMA_URN = `urn:li:dataPlatform:${SIGMA}`; export const SAC = 'sac'; export const SAC_URN = `urn:li:dataPlatform:${SAC}`; +export const DATAHUB = 'datahub'; +export const DATAHUB_GC = 'datahub-gc'; +export const DATAHUB_LINEAGE_FILE = 'datahub-lineage-file'; +export const DATAHUB_BUSINESS_GLOSSARY = 'datahub-business-glossary'; +export const DATAHUB_URN = `urn:li:dataPlatform:${DATAHUB}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, @@ -165,6 +171,7 @@ export const PLATFORM_URN_TO_LOGO = { [QLIK_SENSE_URN]: qlikLogo, [SIGMA_URN]: sigmaLogo, [SAC_URN]: sacLogo, + [DATAHUB_URN]: datahubLogo, }; export const SOURCE_TO_PLATFORM_URN = { @@ -178,5 +185,7 @@ export const SOURCE_TO_PLATFORM_URN = { [SNOWFLAKE_USAGE]: SNOWFLAKE_URN, [STARBURST_TRINO_USAGE]: TRINO_URN, [DBT_CLOUD]: DBT_URN, - [VERTICA]: VERTICA_URN, + [DATAHUB_GC]: DATAHUB_URN, + [DATAHUB_LINEAGE_FILE]: DATAHUB_URN, + [DATAHUB_BUSINESS_GLOSSARY]: DATAHUB_URN, };