From 8967db0de7994e2b103fc15b022778dc64c18b05 Mon Sep 17 00:00:00 2001 From: John Joyce Date: Mon, 15 Jul 2024 11:57:21 -0700 Subject: [PATCH 01/18] refactor(redshift): Improve redshift error handling with new structured reporting system (#10870) Co-authored-by: John Joyce Co-authored-by: Harshal Sheth --- .../ingestion/source/ge_data_profiler.py | 18 ++++- .../ingestion/source/redshift/exception.py | 65 +++++++++++++++++ .../ingestion/source/redshift/lineage_v2.py | 13 +++- .../ingestion/source/redshift/profile.py | 29 +++++--- .../ingestion/source/redshift/query.py | 1 + .../ingestion/source/redshift/redshift.py | 73 +++++++++++++++++-- .../source/redshift/redshift_schema.py | 45 ++++++++---- .../source/sql/sql_generic_profiler.py | 29 +++++--- 8 files changed, 231 insertions(+), 42 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 3173dfa3023999..8843a0ad8eae65 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -1216,8 +1216,22 @@ def _generate_single_profile( except Exception as e: if not self.config.catch_exceptions: raise e - logger.exception(f"Encountered exception while profiling {pretty_name}") - self.report.report_warning(pretty_name, f"Profiling exception {e}") + + error_message = str(e).lower() + if "permission denied" in error_message: + self.report.warning( + title="Unauthorized to extract data profile statistics", + message="We were denied access while attempting to generate profiling statistics for some assets. 
Please ensure the provided user has permission to query these tables and views.", + context=f"Asset: {pretty_name}", + exc=e, + ) + else: + self.report.warning( + title="Failed to extract statistics for some assets", + message="Caught unexpected exception while attempting to extract profiling statistics for some assets.", + context=f"Asset: {pretty_name}", + exc=e, + ) return None finally: if batch is not None and self.base_engine.engine.name == TRINO: diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py new file mode 100644 index 00000000000000..43ad5bfcefdf1b --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/exception.py @@ -0,0 +1,65 @@ +from typing import Callable, Iterable, TypeVar, Union + +import redshift_connector +from typing_extensions import ParamSpec + +from datahub.ingestion.source.redshift.report import RedshiftReport + +T = TypeVar("T") +P = ParamSpec("P") + + +def handle_redshift_exceptions( + report: RedshiftReport, + func: Callable[P, T], + *args: P.args, + **kwargs: P.kwargs, +) -> Union[T, None]: + try: + return func(*args, **kwargs) + except redshift_connector.Error as e: + report_redshift_failure(report, e) + return None + + +def handle_redshift_exceptions_yield( + report: RedshiftReport, + func: Callable[P, Iterable[T]], + *args: P.args, + **kwargs: P.kwargs, +) -> Iterable[T]: + try: + yield from func(*args, **kwargs) + except redshift_connector.Error as e: + report_redshift_failure(report, e) + + +def report_redshift_failure( + report: RedshiftReport, e: redshift_connector.Error +) -> None: + error_message = str(e).lower() + if "permission denied" in error_message: + if "svv_table_info" in error_message: + report.report_failure( + title="Permission denied", + message="Failed to extract metadata due to insufficient permission to access 'svv_table_info' table. 
Please ensure the provided database user has access.", + exc=e, + ) + elif "svl_user_info" in error_message: + report.report_failure( + title="Permission denied", + message="Failed to extract metadata due to insufficient permission to access 'svl_user_info' table. Please ensure the provided database user has access.", + exc=e, + ) + else: + report.report_failure( + title="Permission denied", + message="Failed to extract metadata due to insufficient permissions.", + exc=e, + ) + else: + report.report_failure( + title="Failed to extract some metadata", + message="Failed to extract some metadata from Redshift.", + exc=e, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py index 526e5e2cf12d02..3d6c746183fd92 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage_v2.py @@ -1,6 +1,5 @@ import collections import logging -import traceback from typing import Callable, Dict, Iterable, List, Optional, Set, Tuple, Union import redshift_connector @@ -249,8 +248,10 @@ def _populate_lineage_agg( processor(lineage_row) except Exception as e: self.report.warning( - f"lineage-v2-extract-{lineage_type.name}", - f"Error was {e}, {traceback.format_exc()}", + title="Failed to extract some lineage", + message=f"Failed to extract lineage of type {lineage_type.name}", + context=f"Query: '{query}'", + exc=e, ) self._lineage_v1.report_status(f"extract-{lineage_type.name}", False) @@ -417,3 +418,9 @@ def _process_external_tables( def generate(self) -> Iterable[MetadataWorkUnit]: for mcp in self.aggregator.gen_metadata(): yield mcp.as_workunit() + if len(self.aggregator.report.observed_query_parse_failures) > 0: + self.report.report_failure( + title="Failed to extract some SQL lineage", + message="Unexpected error(s) while attempting to extract lineage from SQL queries. 
See the full logs for more details.", + context=f"Query Parsing Failures: {self.aggregator.report.observed_query_parse_failures}", + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py index eed82ec4d83e76..6f611fa6741879 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py @@ -48,15 +48,26 @@ def get_workunits( if not self.config.schema_pattern.allowed(schema): continue for table in tables[db].get(schema, {}): - if ( - not self.config.profiling.profile_external_tables - and table.type == "EXTERNAL_TABLE" - ): - self.report.profiling_skipped_other[schema] += 1 - logger.info( - f"Skipping profiling of external table {db}.{schema}.{table.name}" - ) - continue + if table.type == "EXTERNAL_TABLE": + if not self.config.profiling.profile_external_tables: + # Case 1: If user did not tell us to profile external tables, simply log this. + self.report.profiling_skipped_other[schema] += 1 + logger.info( + f"Skipping profiling of external table {db}.{schema}.{table.name}" + ) + # Continue, since we should not profile this table. + continue + elif self.config.profiling.profile_table_level_only: + # Case 2: User DID tell us to profile external tables, but only at the table level. + # Currently, we do not support this combination. The user needs to also set + # profile_table_level_only to False in order to profile. + self.report.report_warning( + title="Skipped profiling for external tables", + message="External tables are not supported for profiling when 'profile_table_level_only' config is set to 'True'. Please set 'profile_table_level_only' to 'False' in order to profile external Redshift tables.", + context=f"External Table: {db}.{schema}.{table.name}", + ) + # Continue, since we were unable to retrieve cheap profiling stats from svv_table_info. 
+ continue # Emit the profile work unit profile_request = self.get_profile_request(table, schema, db) if profile_request is not None: diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py index 3bd69d72be6050..affbcd00b5107b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/query.py @@ -122,6 +122,7 @@ def list_tables( else: return f"{tables_query} UNION {external_tables_query}" + # Why is this unused. Is this a bug? list_columns: str = """ SELECT n.nspname as "schema", diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index 68821662762b63..a6ffed65aaa70c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -45,6 +45,7 @@ DatasetSubTypes, ) from datahub.ingestion.source.redshift.config import RedshiftConfig +from datahub.ingestion.source.redshift.exception import handle_redshift_exceptions_yield from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor from datahub.ingestion.source.redshift.lineage_v2 import RedshiftSqlLineageV2 from datahub.ingestion.source.redshift.profile import RedshiftProfiler @@ -411,7 +412,12 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: - connection = RedshiftSource.get_redshift_connection(self.config) + connection = self._try_get_redshift_connection(self.config) + + if connection is None: + # If we failed to establish a connection, short circuit the connector. 
+ return + database = self.config.database logger.info(f"Processing db {database}") self.report.report_ingestion_stage_start(METADATA_EXTRACTION) @@ -419,9 +425,20 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit self.db_views[database] = defaultdict() self.db_schemas.setdefault(database, {}) + # TODO: Ideally, we'd push down exception handling to the place where the connection is used, as opposed to keeping + # this fallback. For now, this gets us broad coverage quickly. + yield from handle_redshift_exceptions_yield( + self.report, self._extract_metadata, connection, database + ) + + def _extract_metadata( + self, connection: redshift_connector.Connection, database: str + ) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + yield from self.gen_database_container( database=database, ) + self.cache_tables_and_views(connection, database) self.report.tables_in_mem_size[database] = humanfriendly.format_size( @@ -556,6 +573,7 @@ def process_schema( ): for table in self.db_tables[schema.database][schema.name]: table.columns = schema_columns[schema.name].get(table.name, []) + table.column_count = len(table.columns) table_wu_generator = self._process_table( table, database=database ) @@ -575,8 +593,10 @@ def process_schema( f"Table processed: {schema.database}.{schema.name}.{table.name}" ) else: - logger.info( - f"No tables in cache for {schema.database}.{schema.name}, skipping" + self.report.info( + title="No tables found in some schemas", + message="No tables found in some schemas. 
This may be due to insufficient privileges for the provided user.", + context=f"Schema: {schema.database}.{schema.name}", ) else: logger.info("Table processing disabled, skipping") @@ -589,6 +609,7 @@ def process_schema( ): for view in self.db_views[schema.database][schema.name]: view.columns = schema_columns[schema.name].get(view.name, []) + view.column_count = len(view.columns) yield from self._process_view( table=view, database=database, schema=schema ) @@ -603,8 +624,10 @@ def process_schema( f"Table processed: {schema.database}.{schema.name}.{view.name}" ) else: - logger.info( - f"No views in cache for {schema.database}.{schema.name}, skipping" + self.report.info( + title="No views found in some schemas", + message="No views found in some schemas. This may be due to insufficient privileges for the provided user.", + context=f"Schema: {schema.database}.{schema.name}", ) else: logger.info("View processing disabled, skipping") @@ -1088,3 +1111,43 @@ def add_config_to_report(self): self.config.start_time, self.config.end_time, ) + + def _try_get_redshift_connection( + self, + config: RedshiftConfig, + ) -> Optional[redshift_connector.Connection]: + try: + return RedshiftSource.get_redshift_connection(config) + except redshift_connector.Error as e: + error_message = str(e).lower() + if "password authentication failed" in error_message: + self.report.report_failure( + title="Invalid credentials", + message="Failed to connect to Redshift. Please verify your username, password, and database.", + exc=e, + ) + elif "timeout" in error_message: + self.report.report_failure( + title="Unable to connect", + message="Failed to connect to Redshift. Please verify your host name and port number.", + exc=e, + ) + elif "communication error" in error_message: + self.report.report_failure( + title="Unable to connect", + message="Failed to connect to Redshift. 
Please verify that the host name is valid and reachable.", + exc=e, + ) + elif "database" in error_message and "does not exist" in error_message: + self.report.report_failure( + title="Database does not exist", + message="Failed to connect to Redshift. Please verify that the provided database exists and the provided user has access to it.", + exc=e, + ) + else: + self.report.report_failure( + title="Unable to connect", + message="Failed to connect to Redshift. Please verify your connection details.", + exc=e, + ) + return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py index 101146563e8e74..6e88a50f898a5d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift_schema.py @@ -169,6 +169,8 @@ def enrich_tables( self, conn: redshift_connector.Connection, ) -> Dict[str, Dict[str, RedshiftExtraTableMeta]]: + # Warning: This table enrichment will not return anything for + # external tables (spectrum) and for tables that have never been queried / written to. cur = RedshiftDataDictionary.get_query_result( conn, self.queries.additional_table_metadata_query() ) @@ -207,7 +209,7 @@ def get_tables_and_views( # This query needs to run separately as we can't join with the main query because it works with # driver only functions. 
- enriched_table = self.enrich_tables(conn) + enriched_tables = self.enrich_tables(conn) cur = RedshiftDataDictionary.get_query_result( conn, @@ -216,6 +218,7 @@ def get_tables_and_views( field_names = [i[0] for i in cur.description] db_tables = cur.fetchall() logger.info(f"Fetched {len(db_tables)} tables/views from Redshift") + for table in db_tables: schema = table[field_names.index("schema")] table_name = table[field_names.index("relname")] @@ -233,7 +236,7 @@ def get_tables_and_views( rows_count, size_in_bytes, ) = RedshiftDataDictionary.get_table_stats( - enriched_table, field_names, schema, table + enriched_tables, field_names, schema, table ) tables[schema].append( @@ -263,15 +266,15 @@ def get_tables_and_views( rows_count, size_in_bytes, ) = RedshiftDataDictionary.get_table_stats( - enriched_table=enriched_table, + enriched_tables=enriched_tables, field_names=field_names, schema=schema, table=table, ) materialized = False - if schema in enriched_table and table_name in enriched_table[schema]: - if enriched_table[schema][table_name].is_materialized: + if schema in enriched_tables and table_name in enriched_tables[schema]: + if enriched_tables[schema][table_name].is_materialized: materialized = True views[schema].append( @@ -298,7 +301,7 @@ def get_tables_and_views( return tables, views @staticmethod - def get_table_stats(enriched_table, field_names, schema, table): + def get_table_stats(enriched_tables, field_names, schema, table): table_name = table[field_names.index("relname")] creation_time: Optional[datetime] = None @@ -309,25 +312,41 @@ def get_table_stats(enriched_table, field_names, schema, table): last_altered: Optional[datetime] = None size_in_bytes: Optional[int] = None rows_count: Optional[int] = None - if schema in enriched_table and table_name in enriched_table[schema]: - if enriched_table[schema][table_name].last_accessed: + if schema in enriched_tables and table_name in enriched_tables[schema]: + if 
enriched_tables[schema][table_name].last_accessed is not None: # Mypy seems to be not clever enough to understand the above check - last_accessed = enriched_table[schema][table_name].last_accessed + last_accessed = enriched_tables[schema][table_name].last_accessed assert last_accessed last_altered = last_accessed.replace(tzinfo=timezone.utc) elif creation_time: last_altered = creation_time - if enriched_table[schema][table_name].size: + if enriched_tables[schema][table_name].size is not None: # Mypy seems to be not clever enough to understand the above check - size = enriched_table[schema][table_name].size + size = enriched_tables[schema][table_name].size if size: size_in_bytes = size * 1024 * 1024 - if enriched_table[schema][table_name].estimated_visible_rows: - rows = enriched_table[schema][table_name].estimated_visible_rows + if enriched_tables[schema][table_name].estimated_visible_rows is not None: + rows = enriched_tables[schema][table_name].estimated_visible_rows assert rows rows_count = int(rows) + else: + # The object was not found in the enriched data. + # + # If we don't have enriched data, it may be either because: + # 1 The table is empty (as per https://docs.aws.amazon.com/redshift/latest/dg/r_SVV_TABLE_INFO.html) empty tables are omitted from svv_table_info. + # 2. The table is external + # 3. The table is a view (non-materialized) + # + # In case 1, we want to report an accurate profile suggesting that the table is empty. 
+ # In case 2, do nothing since we cannot cheaply profile + # In case 3, do nothing since we cannot cheaply profile + if table[field_names.index("tabletype")] == "TABLE": + rows_count = 0 + size_in_bytes = 0 + logger.info("Found some tables with no profiles need to return 0") + return creation_time, last_altered, rows_count, size_in_bytes @staticmethod diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index 968989e2548d13..9c8e475e7b3074 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -92,16 +92,25 @@ def generate_profile_workunits( request for request in requests if request.profile_table_level_only ] for request in table_level_profile_requests: - table_level_profile = DatasetProfile( - timestampMillis=int(datetime.now().timestamp() * 1000), - columnCount=request.table.column_count, - rowCount=request.table.rows_count, - sizeInBytes=request.table.size_in_bytes, - ) - dataset_urn = self.dataset_urn_builder(request.pretty_name) - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=table_level_profile - ).as_workunit() + if ( + request.table.column_count is None + and request.table.rows_count is None + and request.table.size_in_bytes is None + ): + logger.warning( + f"Table {request.pretty_name} has no column count, rows count, or size in bytes. Skipping emitting table level profile." 
+ ) + else: + table_level_profile = DatasetProfile( + timestampMillis=int(datetime.now().timestamp() * 1000), + columnCount=request.table.column_count, + rowCount=request.table.rows_count, + sizeInBytes=request.table.size_in_bytes, + ) + dataset_urn = self.dataset_urn_builder(request.pretty_name) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=table_level_profile + ).as_workunit() if not ge_profile_requests: return From 36fd614e3b252fdc91fe7155292b8dff092bd6a7 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Mon, 15 Jul 2024 15:14:15 -0400 Subject: [PATCH 02/18] feat(ui) Finalize support for all entity types on forms (#10915) --- datahub-web-react/src/Mocks.tsx | 1 + datahub-web-react/src/graphql/container.graphql | 1 + datahub-web-react/src/graphql/dashboard.graphql | 2 ++ datahub-web-react/src/graphql/dataJob.graphql | 2 ++ datahub-web-react/src/graphql/dataPlatform.graphql | 2 ++ datahub-web-react/src/graphql/dataProduct.graphql | 2 ++ datahub-web-react/src/graphql/dataset.graphql | 1 + datahub-web-react/src/graphql/mlFeature.graphql | 2 ++ datahub-web-react/src/graphql/mlFeatureTable.graphql | 2 ++ datahub-web-react/src/graphql/mlModel.graphql | 2 ++ datahub-web-react/src/graphql/mlModelGroup.graphql | 2 ++ datahub-web-react/src/graphql/mlPrimaryKey.graphql | 2 ++ datahub-web-react/src/graphql/tag.graphql | 1 + datahub-web-react/src/graphql/user.graphql | 1 + 14 files changed, 23 insertions(+) diff --git a/datahub-web-react/src/Mocks.tsx b/datahub-web-react/src/Mocks.tsx index de471b6b9f2fb8..aed672a34e7caf 100644 --- a/datahub-web-react/src/Mocks.tsx +++ b/datahub-web-react/src/Mocks.tsx @@ -1254,6 +1254,7 @@ export const glossaryNode5 = { export const sampleTag = { urn: 'urn:li:tag:abc-sample-tag', + type: EntityType.Tag, name: 'abc-sample-tag', description: 'sample tag description', ownership: { diff --git a/datahub-web-react/src/graphql/container.graphql b/datahub-web-react/src/graphql/container.graphql index 
749c1c9172b6d5..4b3ecfe8aaaff9 100644 --- a/datahub-web-react/src/graphql/container.graphql +++ b/datahub-web-react/src/graphql/container.graphql @@ -1,6 +1,7 @@ query getContainer($urn: String!) { container(urn: $urn) { urn + type exists lastIngested platform { diff --git a/datahub-web-react/src/graphql/dashboard.graphql b/datahub-web-react/src/graphql/dashboard.graphql index 68a966a68e00a9..681c98f361ccbf 100644 --- a/datahub-web-react/src/graphql/dashboard.graphql +++ b/datahub-web-react/src/graphql/dashboard.graphql @@ -1,5 +1,7 @@ query getDashboard($urn: String!) { dashboard(urn: $urn) { + urn + type ...dashboardFields privileges { ...entityPrivileges diff --git a/datahub-web-react/src/graphql/dataJob.graphql b/datahub-web-react/src/graphql/dataJob.graphql index 78247bd460fbb7..836aac35deaf51 100644 --- a/datahub-web-react/src/graphql/dataJob.graphql +++ b/datahub-web-react/src/graphql/dataJob.graphql @@ -1,5 +1,7 @@ query getDataJob($urn: String!) { dataJob(urn: $urn) { + urn + type ...dataJobFields privileges { ...entityPrivileges diff --git a/datahub-web-react/src/graphql/dataPlatform.graphql b/datahub-web-react/src/graphql/dataPlatform.graphql index 6281cf155a5d22..44acbf6737faea 100644 --- a/datahub-web-react/src/graphql/dataPlatform.graphql +++ b/datahub-web-react/src/graphql/dataPlatform.graphql @@ -1,5 +1,7 @@ query getDataPlatform($urn: String!) { dataPlatform(urn: $urn) { + urn + type ...platformFields } } diff --git a/datahub-web-react/src/graphql/dataProduct.graphql b/datahub-web-react/src/graphql/dataProduct.graphql index eb053ca9561315..623ece13dbfc18 100644 --- a/datahub-web-react/src/graphql/dataProduct.graphql +++ b/datahub-web-react/src/graphql/dataProduct.graphql @@ -1,5 +1,7 @@ query getDataProduct($urn: String!) 
{ dataProduct(urn: $urn) { + urn + type ...dataProductFields privileges { ...entityPrivileges diff --git a/datahub-web-react/src/graphql/dataset.graphql b/datahub-web-react/src/graphql/dataset.graphql index 1ca25a6ba3bf6f..fcca919f614235 100644 --- a/datahub-web-react/src/graphql/dataset.graphql +++ b/datahub-web-react/src/graphql/dataset.graphql @@ -1,6 +1,7 @@ query getDataProfiles($urn: String!, $limit: Int, $startTime: Long, $endTime: Long) { dataset(urn: $urn) { urn + type datasetProfiles(limit: $limit, startTimeMillis: $startTime, endTimeMillis: $endTime) { rowCount columnCount diff --git a/datahub-web-react/src/graphql/mlFeature.graphql b/datahub-web-react/src/graphql/mlFeature.graphql index d6a75e16b86f17..2ed5ecfb37fdae 100644 --- a/datahub-web-react/src/graphql/mlFeature.graphql +++ b/datahub-web-react/src/graphql/mlFeature.graphql @@ -1,5 +1,7 @@ query getMLFeature($urn: String!) { mlFeature(urn: $urn) { + urn + type ...nonRecursiveMLFeature privileges { ...entityPrivileges diff --git a/datahub-web-react/src/graphql/mlFeatureTable.graphql b/datahub-web-react/src/graphql/mlFeatureTable.graphql index a6e069c120518a..02efbaf9766e1a 100644 --- a/datahub-web-react/src/graphql/mlFeatureTable.graphql +++ b/datahub-web-react/src/graphql/mlFeatureTable.graphql @@ -1,5 +1,7 @@ query getMLFeatureTable($urn: String!) { mlFeatureTable(urn: $urn) { + urn + type ...nonRecursiveMLFeatureTable privileges { ...entityPrivileges diff --git a/datahub-web-react/src/graphql/mlModel.graphql b/datahub-web-react/src/graphql/mlModel.graphql index 1626bc473213af..2192888caef701 100644 --- a/datahub-web-react/src/graphql/mlModel.graphql +++ b/datahub-web-react/src/graphql/mlModel.graphql @@ -1,5 +1,7 @@ query getMLModel($urn: String!) 
{ mlModel(urn: $urn) { + urn + type ...nonRecursiveMLModel features: relationships(input: { types: ["Consumes"], direction: OUTGOING, start: 0, count: 100 }) { start diff --git a/datahub-web-react/src/graphql/mlModelGroup.graphql b/datahub-web-react/src/graphql/mlModelGroup.graphql index 8ae049c8c0b1db..81ab65d0b9a08d 100644 --- a/datahub-web-react/src/graphql/mlModelGroup.graphql +++ b/datahub-web-react/src/graphql/mlModelGroup.graphql @@ -1,5 +1,7 @@ query getMLModelGroup($urn: String!) { mlModelGroup(urn: $urn) { + urn + type ...nonRecursiveMLModelGroupFields incoming: relationships( input: { diff --git a/datahub-web-react/src/graphql/mlPrimaryKey.graphql b/datahub-web-react/src/graphql/mlPrimaryKey.graphql index 599c4d7fabcac2..d39f9d3fbdfa25 100644 --- a/datahub-web-react/src/graphql/mlPrimaryKey.graphql +++ b/datahub-web-react/src/graphql/mlPrimaryKey.graphql @@ -1,5 +1,7 @@ query getMLPrimaryKey($urn: String!) { mlPrimaryKey(urn: $urn) { + urn + type ...nonRecursiveMLPrimaryKey privileges { ...entityPrivileges diff --git a/datahub-web-react/src/graphql/tag.graphql b/datahub-web-react/src/graphql/tag.graphql index 031d923276bfe3..0bf0953b15fbea 100644 --- a/datahub-web-react/src/graphql/tag.graphql +++ b/datahub-web-react/src/graphql/tag.graphql @@ -1,6 +1,7 @@ query getTag($urn: String!) { tag(urn: $urn) { urn + type name description properties { diff --git a/datahub-web-react/src/graphql/user.graphql b/datahub-web-react/src/graphql/user.graphql index a8a4e902849565..030ef85df7124b 100644 --- a/datahub-web-react/src/graphql/user.graphql +++ b/datahub-web-react/src/graphql/user.graphql @@ -1,6 +1,7 @@ query getUser($urn: String!, $groupsCount: Int!) 
{ corpUser(urn: $urn) { urn + type username isNativeUser exists From 78bffc394846c9887a07c9dee562d172a2fc1893 Mon Sep 17 00:00:00 2001 From: noggi Date: Mon, 15 Jul 2024 12:54:33 -0700 Subject: [PATCH 03/18] Index ExecutionRequestResults status field (#10811) --- datahub-web-react/src/app/ingest/source/utils.ts | 6 ++++++ .../com/linkedin/execution/ExecutionRequestResult.pdl | 6 +++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/datahub-web-react/src/app/ingest/source/utils.ts b/datahub-web-react/src/app/ingest/source/utils.ts index 7e00522ffaf47d..44630b4fca5c2f 100644 --- a/datahub-web-react/src/app/ingest/source/utils.ts +++ b/datahub-web-react/src/app/ingest/source/utils.ts @@ -47,6 +47,7 @@ export const WARNING = 'WARNING'; export const FAILURE = 'FAILURE'; export const CONNECTION_FAILURE = 'CONNECTION_FAILURE'; export const CANCELLED = 'CANCELLED'; +export const ABORTED = 'ABORTED'; export const UP_FOR_RETRY = 'UP_FOR_RETRY'; export const ROLLING_BACK = 'ROLLING_BACK'; export const ROLLED_BACK = 'ROLLED_BACK'; @@ -68,6 +69,7 @@ export const getExecutionRequestStatusIcon = (status: string) => { (status === ROLLED_BACK && WarningOutlined) || (status === ROLLING_BACK && LoadingOutlined) || (status === ROLLBACK_FAILED && CloseCircleOutlined) || + (status === ABORTED && CloseCircleOutlined) || ClockCircleOutlined ); }; @@ -83,6 +85,7 @@ export const getExecutionRequestStatusDisplayText = (status: string) => { (status === ROLLED_BACK && 'Rolled Back') || (status === ROLLING_BACK && 'Rolling Back') || (status === ROLLBACK_FAILED && 'Rollback Failed') || + (status === ABORTED && 'Aborted') || status ); }; @@ -105,6 +108,8 @@ export const getExecutionRequestSummaryText = (status: string) => { return 'Ingestion is in the process of rolling back.'; case ROLLBACK_FAILED: return 'Ingestion rollback failed.'; + case ABORTED: + return 'Ingestion job got aborted due to worker restart.'; default: return 'Ingestion status not recognized.'; } @@ -121,6 +126,7 
@@ export const getExecutionRequestStatusDisplayColor = (status: string) => { (status === ROLLED_BACK && 'orange') || (status === ROLLING_BACK && 'orange') || (status === ROLLBACK_FAILED && 'red') || + (status === ABORTED && 'red') || ANTD_GRAY[7] ); }; diff --git a/metadata-models/src/main/pegasus/com/linkedin/execution/ExecutionRequestResult.pdl b/metadata-models/src/main/pegasus/com/linkedin/execution/ExecutionRequestResult.pdl index 29acd0aa523899..606c3a06bc74b7 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/execution/ExecutionRequestResult.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/execution/ExecutionRequestResult.pdl @@ -10,6 +10,10 @@ record ExecutionRequestResult { /** * The status of the execution request */ + @Searchable = { + "fieldType": "KEYWORD", + "fieldName": "executionResultStatus" + } status: string /** @@ -36,4 +40,4 @@ record ExecutionRequestResult { * Duration in milliseconds */ durationMs: optional long -} \ No newline at end of file +} From 437bacb0e6e52236fb1005335aa32979f73d5ad8 Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Tue, 16 Jul 2024 02:42:18 +0530 Subject: [PATCH 04/18] feat(ingest): grafana connector (#10891) Co-authored-by: Shirshanka Das Co-authored-by: Harshal Sheth --- metadata-ingestion/setup.py | 2 + .../ingestion/source/grafana/__init__.py | 0 .../source/grafana/grafana_source.py | 131 ++++++++++++ .../grafana/default-dashboard.json | 25 +++ .../integration/grafana/docker-compose.yml | 32 +++ .../grafana/grafana_mcps_golden.json | 56 +++++ .../provisioning/api-keys/api_keys.yaml | 3 + .../provisioning/dashboards/dashboard.yaml | 11 + .../provisioning/datasources/datasource.yaml | 12 ++ .../service_accounts/service_accounts.yaml | 6 + .../tests/integration/grafana/test_grafana.py | 191 ++++++++++++++++++ 11 files changed, 469 insertions(+) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/grafana/__init__.py create mode 100644 
metadata-ingestion/src/datahub/ingestion/source/grafana/grafana_source.py create mode 100644 metadata-ingestion/tests/integration/grafana/default-dashboard.json create mode 100644 metadata-ingestion/tests/integration/grafana/docker-compose.yml create mode 100644 metadata-ingestion/tests/integration/grafana/grafana_mcps_golden.json create mode 100644 metadata-ingestion/tests/integration/grafana/provisioning/api-keys/api_keys.yaml create mode 100644 metadata-ingestion/tests/integration/grafana/provisioning/dashboards/dashboard.yaml create mode 100644 metadata-ingestion/tests/integration/grafana/provisioning/datasources/datasource.yaml create mode 100644 metadata-ingestion/tests/integration/grafana/provisioning/service_accounts/service_accounts.yaml create mode 100644 metadata-ingestion/tests/integration/grafana/test_grafana.py diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index b8db746a63fdba..41c04ca4a433cf 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -345,6 +345,7 @@ "flask-openid>=1.3.0", "dask[dataframe]<2024.7.0", }, + "grafana": {"requests"}, "glue": aws_common, # hdbcli is supported officially by SAP, sqlalchemy-hana is built on top but not officially supported "hana": sql_common @@ -635,6 +636,7 @@ "dynamodb = datahub.ingestion.source.dynamodb.dynamodb:DynamoDBSource", "elasticsearch = datahub.ingestion.source.elastic_search:ElasticsearchSource", "feast = datahub.ingestion.source.feast:FeastRepositorySource", + "grafana = datahub.ingestion.source.grafana.grafana_source:GrafanaSource", "glue = datahub.ingestion.source.aws.glue:GlueSource", "sagemaker = datahub.ingestion.source.aws.sagemaker:SagemakerSource", "hana = datahub.ingestion.source.sql.hana:HanaSource", diff --git a/metadata-ingestion/src/datahub/ingestion/source/grafana/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/grafana/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/grafana/grafana_source.py b/metadata-ingestion/src/datahub/ingestion/source/grafana/grafana_source.py new file mode 100644 index 00000000000000..53f71046c25c0d --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/grafana/grafana_source.py @@ -0,0 +1,131 @@ +from typing import Iterable, List, Optional + +import requests +from pydantic import Field, SecretStr + +import datahub.emitter.mce_builder as builder +from datahub.configuration.source_common import PlatformInstanceConfigMixin +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SupportStatus, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor +from datahub.ingestion.api.source_helpers import auto_workunit +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, + StaleEntityRemovalSourceReport, + StatefulIngestionConfigBase, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionReport, + StatefulIngestionSourceBase, +) +from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps +from datahub.metadata.schema_classes import DashboardInfoClass, StatusClass + + +class GrafanaSourceConfig(StatefulIngestionConfigBase, PlatformInstanceConfigMixin): + url: str = Field( + default="", + description="Grafana URL in the format http://your-grafana-instance with no trailing slash", + ) + service_account_token: SecretStr = Field( + description="Service account token for Grafana" + ) + + +class GrafanaReport(StaleEntityRemovalSourceReport): + pass + + +@platform_name("Grafana") +@config_class(GrafanaSourceConfig) +@support_status(SupportStatus.TESTING) +class GrafanaSource(StatefulIngestionSourceBase): + """ + 
This is an experimental source for Grafana. + Currently only ingests dashboards (no charts) + """ + + def __init__(self, config: GrafanaSourceConfig, ctx: PipelineContext): + super().__init__(config, ctx) + self.source_config = config + self.report = GrafanaReport() + self.platform = "grafana" + + @classmethod + def create(cls, config_dict, ctx): + config = GrafanaSourceConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.source_config, self.ctx + ).workunit_processor, + ] + + def get_report(self) -> StatefulIngestionReport: + return self.report + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + headers = { + "Authorization": f"Bearer {self.source_config.service_account_token.get_secret_value()}", + "Content-Type": "application/json", + } + try: + response = requests.get( + f"{self.source_config.url}/api/search", headers=headers + ) + response.raise_for_status() + except requests.exceptions.RequestException as e: + self.report.report_failure(f"Failed to fetch dashboards: {str(e)}") + return + res_json = response.json() + for item in res_json: + uid = item["uid"] + title = item["title"] + url_path = item["url"] + full_url = f"{self.source_config.url}{url_path}" + dashboard_urn = builder.make_dashboard_urn( + platform=self.platform, + name=uid, + platform_instance=self.source_config.platform_instance, + ) + + yield from auto_workunit( + MetadataChangeProposalWrapper.construct_many( + entityUrn=dashboard_urn, + aspects=[ + DashboardInfoClass( + description="", + title=title, + charts=[], + lastModified=ChangeAuditStamps(), + externalUrl=full_url, + customProperties={ + key: str(value) + for key, value in { + "displayName": title, + "id": item["id"], + "uid": uid, + "title": title, + "uri": item["uri"], + "type": item["type"], + "folderId": item.get("folderId"), + 
"folderUid": item.get("folderUid"), + "folderTitle": item.get("folderTitle"), + }.items() + if value is not None + }, + ), + StatusClass(removed=False), + ], + ) + ) diff --git a/metadata-ingestion/tests/integration/grafana/default-dashboard.json b/metadata-ingestion/tests/integration/grafana/default-dashboard.json new file mode 100644 index 00000000000000..8ce40ad6acb13a --- /dev/null +++ b/metadata-ingestion/tests/integration/grafana/default-dashboard.json @@ -0,0 +1,25 @@ +{ + "id": null, + "uid": "default", + "title": "Default Dashboard", + "tags": [], + "timezone": "browser", + "schemaVersion": 16, + "version": 0, + "panels": [ + { + "type": "text", + "title": "Welcome", + "gridPos": { + "x": 0, + "y": 0, + "w": 24, + "h": 5 + }, + "options": { + "content": "Welcome to your Grafana dashboard!", + "mode": "markdown" + } + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/grafana/docker-compose.yml b/metadata-ingestion/tests/integration/grafana/docker-compose.yml new file mode 100644 index 00000000000000..41995a1d49da60 --- /dev/null +++ b/metadata-ingestion/tests/integration/grafana/docker-compose.yml @@ -0,0 +1,32 @@ +version: '3.7' + +services: + grafana: + image: grafana/grafana:latest + container_name: grafana + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_SECURITY_ADMIN_USER=admin + - GF_PATHS_PROVISIONING=/etc/grafana/provisioning + volumes: + - grafana-storage:/var/lib/grafana + - ./provisioning:/etc/grafana/provisioning + - ./default-dashboard.json:/var/lib/grafana/dashboards/default-dashboard.json + depends_on: + - postgres + + postgres: + image: postgres:13 + container_name: grafana-postgres + environment: + POSTGRES_DB: grafana + POSTGRES_USER: grafana + POSTGRES_PASSWORD: grafana + volumes: + - postgres-storage:/var/lib/postgresql/data + +volumes: + grafana-storage: + postgres-storage: diff --git a/metadata-ingestion/tests/integration/grafana/grafana_mcps_golden.json 
b/metadata-ingestion/tests/integration/grafana/grafana_mcps_golden.json new file mode 100644 index 00000000000000..1447e840eac8cd --- /dev/null +++ b/metadata-ingestion/tests/integration/grafana/grafana_mcps_golden.json @@ -0,0 +1,56 @@ +[ +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(grafana,default)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "json": { + "customProperties": { + "displayName": "Default Dashboard", + "id": "1", + "uid": "default", + "title": "Default Dashboard", + "uri": "db/default-dashboard", + "type": "dash-db" + }, + "externalUrl": "http://localhost:3000/d/default/default-dashboard", + "title": "Default Dashboard", + "description": "", + "charts": [], + "datasets": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + }, + "systemMetadata": { + "lastObserved": 1720785600000, + "runId": "grafana-test-simple", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(grafana,default)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1720785600000, + "runId": "grafana-test-simple", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/grafana/provisioning/api-keys/api_keys.yaml b/metadata-ingestion/tests/integration/grafana/provisioning/api-keys/api_keys.yaml new file mode 100644 index 00000000000000..7ef096b6bfe973 --- /dev/null +++ b/metadata-ingestion/tests/integration/grafana/provisioning/api-keys/api_keys.yaml @@ -0,0 +1,3 @@ +api_keys: + - name: 'example-api-key' + role: 'Admin' diff --git a/metadata-ingestion/tests/integration/grafana/provisioning/dashboards/dashboard.yaml 
b/metadata-ingestion/tests/integration/grafana/provisioning/dashboards/dashboard.yaml new file mode 100644 index 00000000000000..e6d4aa3a45a39d --- /dev/null +++ b/metadata-ingestion/tests/integration/grafana/provisioning/dashboards/dashboard.yaml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + updateIntervalSeconds: 10 + options: + path: /var/lib/grafana/dashboards diff --git a/metadata-ingestion/tests/integration/grafana/provisioning/datasources/datasource.yaml b/metadata-ingestion/tests/integration/grafana/provisioning/datasources/datasource.yaml new file mode 100644 index 00000000000000..9ba65ec1a54bc6 --- /dev/null +++ b/metadata-ingestion/tests/integration/grafana/provisioning/datasources/datasource.yaml @@ -0,0 +1,12 @@ +apiVersion: 1 + +datasources: + - name: PostgreSQL + type: postgres + access: proxy + url: postgres:5432 + database: grafana + user: grafana + password: grafana + jsonData: + sslmode: disable diff --git a/metadata-ingestion/tests/integration/grafana/provisioning/service_accounts/service_accounts.yaml b/metadata-ingestion/tests/integration/grafana/provisioning/service_accounts/service_accounts.yaml new file mode 100644 index 00000000000000..a6c259aac77abd --- /dev/null +++ b/metadata-ingestion/tests/integration/grafana/provisioning/service_accounts/service_accounts.yaml @@ -0,0 +1,6 @@ +service_accounts: + - name: 'example-service-account' + role: 'Admin' + apiKeys: + - keyName: 'example-api-key' + role: 'Admin' diff --git a/metadata-ingestion/tests/integration/grafana/test_grafana.py b/metadata-ingestion/tests/integration/grafana/test_grafana.py new file mode 100644 index 00000000000000..6eb6b0b8509263 --- /dev/null +++ b/metadata-ingestion/tests/integration/grafana/test_grafana.py @@ -0,0 +1,191 @@ +import logging +import time +from base64 import b64encode + +import pytest +import requests +from freezegun import freeze_time + +from 
datahub.ingestion.run.pipeline import Pipeline +from tests.test_helpers import fs_helpers, mce_helpers +from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port + +pytestmark = pytest.mark.integration_batch_2 + +FROZEN_TIME = "2024-07-12 12:00:00" + + +logger = logging.getLogger(__name__) + + +class GrafanaClient: + def __init__(self, url, admin_user, admin_password): + self.url = url + self.auth = (admin_user, admin_password) + self.headers = { + "Authorization": f"Basic {b64encode(f'{admin_user}:{admin_password}'.encode()).decode()}", + "Content-Type": "application/json", + } + + def create_service_account(self, name, role): + service_account_payload = {"name": name, "role": role, "isDisabled": False} + try: + response = requests.post( + f"{self.url}/api/serviceaccounts", + headers=self.headers, + json=service_account_payload, + ) + response.raise_for_status() + service_account = response.json() + return service_account + except requests.exceptions.RequestException as e: + logging.error(f"Error creating service account: {e}") + return None + + def create_api_key(self, service_account_id, key_name, role): + api_key_payload = {"name": key_name, "role": role} + try: + response = requests.post( + f"{self.url}/api/serviceaccounts/{service_account_id}/tokens", + headers=self.headers, + json=api_key_payload, + ) + response.raise_for_status() + api_key = response.json() + return api_key["key"] + except requests.exceptions.RequestException as e: + logging.error(f"Error creating API key: {e}") + return None + + +@pytest.fixture(scope="module") +def test_resources_dir(pytestconfig): + return pytestconfig.rootpath / "tests/integration/grafana" + + +@pytest.fixture(scope="module") +def test_api_key(): + # Example usage: + url = "http://localhost:3000" + admin_user = "admin" + admin_password = "admin" + + grafana_client = GrafanaClient(url, admin_user, admin_password) + + # Step 1: Create the service account + service_account = 
grafana_client.create_service_account( + name="example-service-account", role="Viewer" + ) + if service_account: + print(f"Service Account Created: {service_account}") + + # Step 2: Create the API key for the service account + api_key = grafana_client.create_api_key( + service_account_id=service_account["id"], + key_name="example-api-key", + role="Admin", + ) + if api_key: + print("Service Account API Key:", api_key) + return api_key + else: + print("Failed to create API key for the service account") + else: + print("Failed to create service account") + + +@pytest.fixture(scope="module") +def loaded_grafana(docker_compose_runner, test_resources_dir): + with docker_compose_runner( + test_resources_dir / "docker-compose.yml", "grafana" + ) as docker_services: + wait_for_port( + docker_services, + container_name="grafana", + container_port=3000, + timeout=300, + ) + yield docker_services + + # The Grafana image can be large, so we remove it after the test. + cleanup_image("grafana/grafana") + + +@freeze_time(FROZEN_TIME) +def test_grafana_dashboard(loaded_grafana, pytestconfig, tmp_path, test_resources_dir): + # Wait for Grafana to be up and running + url = "http://localhost:3000/api/health" + for i in range(30): + logging.info("waiting for Grafana to start...") + time.sleep(5) + resp = requests.get(url) + if resp.status_code == 200: + logging.info(f"Grafana started after waiting {i*5} seconds") + break + else: + pytest.fail("Grafana did not start in time") + + # Check if the default dashboard is loaded + dashboard_url = "http://localhost:3000/api/dashboards/uid/default" + resp = requests.get(dashboard_url, auth=("admin", "admin")) + assert resp.status_code == 200, "Failed to load default dashboard" + dashboard = resp.json() + + assert ( + dashboard["dashboard"]["title"] == "Default Dashboard" + ), "Default dashboard title mismatch" + assert any( + panel["type"] == "text" for panel in dashboard["dashboard"]["panels"] + ), "Default dashboard missing text panel" + + # 
Verify the output. (You can add further checks here if needed) + logging.info("Default dashboard verified successfully") + + +@freeze_time(FROZEN_TIME) +def test_grafana_ingest( + loaded_grafana, pytestconfig, tmp_path, test_resources_dir, test_api_key +): + # Wait for Grafana to be up and running + url = "http://localhost:3000/api/health" + for i in range(30): + logging.info("waiting for Grafana to start...") + time.sleep(5) + resp = requests.get(url) + if resp.status_code == 200: + logging.info(f"Grafana started after waiting {i*5} seconds") + break + else: + pytest.fail("Grafana did not start in time") + + # Run the metadata ingestion pipeline. + with fs_helpers.isolated_filesystem(tmp_path): + # Run grafana ingestion run. + pipeline = Pipeline.create( + { + "run_id": "grafana-test-simple", + "source": { + "type": "grafana", + "config": { + "url": "http://localhost:3000", + "service_account_token": test_api_key, + }, + }, + "sink": { + "type": "file", + "config": {"filename": "./grafana_mcps.json"}, + }, + } + ) + pipeline.run() + pipeline.raise_from_status() + + # Verify the output. 
+ mce_helpers.check_golden_file( + pytestconfig, + output_path="grafana_mcps.json", + golden_path=test_resources_dir / "grafana_mcps_golden.json", + ignore_paths=[ + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['last_event_time'\]", + ], + ) From 5f38d13eb7d8d52357753054a870b7a41d0f0c13 Mon Sep 17 00:00:00 2001 From: Chris Collins Date: Mon, 15 Jul 2024 18:47:36 -0400 Subject: [PATCH 05/18] fix(gms) Add Form entity type to EntityTypeMapper (#10916) --- .../datahub/graphql/types/entitytype/EntityTypeMapper.java | 1 + 1 file changed, 1 insertion(+) diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java index ffb14df5e800b6..26835f9e57dcd8 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/entitytype/EntityTypeMapper.java @@ -48,6 +48,7 @@ public class EntityTypeMapper { .put(EntityType.BUSINESS_ATTRIBUTE, Constants.BUSINESS_ATTRIBUTE_ENTITY_NAME) .put(EntityType.QUERY, Constants.QUERY_ENTITY_NAME) .put(EntityType.POST, Constants.POST_ENTITY_NAME) + .put(EntityType.FORM, Constants.FORM_ENTITY_NAME) .build(); private static final Map ENTITY_NAME_TO_TYPE = From cfcd216924e077eb0ad2716eb3a7061b49f97023 Mon Sep 17 00:00:00 2001 From: Nicholas Pena Date: Mon, 15 Jul 2024 18:57:23 -0400 Subject: [PATCH 06/18] feat(dataset): add support for external url in Dataset (#10877) --- metadata-ingestion/src/datahub/api/entities/dataset/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py index c71bced38f8aa9..afeedb83f79983 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py +++ 
b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py @@ -162,6 +162,7 @@ class Dataset(BaseModel): structured_properties: Optional[ Dict[str, Union[str, float, List[Union[str, float]]]] ] = None + external_url: Optional[str] = None @property def platform_urn(self) -> str: @@ -236,6 +237,7 @@ def generate_mcp( description=self.description, name=self.name, customProperties=self.properties, + externalUrl=self.external_url, ), ) yield mcp From 643ba0c5cf8bc8d8c334c6d50982491c77f48a0e Mon Sep 17 00:00:00 2001 From: Jay <159848059+jayacryl@users.noreply.github.com> Date: Mon, 15 Jul 2024 20:39:49 -0400 Subject: [PATCH 07/18] docs(saas-overview) added missing features to observe section (#10913) Co-authored-by: John Joyce --- docs/managed-datahub/managed-datahub-overview.md | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/docs/managed-datahub/managed-datahub-overview.md b/docs/managed-datahub/managed-datahub-overview.md index 867b03501e0e06..087238097dd9f4 100644 --- a/docs/managed-datahub/managed-datahub-overview.md +++ b/docs/managed-datahub/managed-datahub-overview.md @@ -51,10 +51,17 @@ know. 
| Feature | DataHub | Acryl DataHub | | ---------------------------------------------- | ------- | ------------- | | Surface data quality results | ✅ | ✅ | -| Build and enforce continuous data SLAs | ❌ | ✅ | -| Continuous monitoring of dataset health | ❌ | ✅ | -| Data observability alerts and notifications | ❌ | ✅ | -| Data Incident management | ❌ | ✅ | +| Create data contracts | ✅ | ✅ | +| Raise and Resolve Data Incidents | ✅ | ✅ | +| Monitor Freshness SLAs | ❌ | ✅ | +| Monitor Table Schemas | ❌ | ✅ | +| Monitor Table Volume | ❌ | ✅ | +| Validate Table Columns | ❌ | ✅ | +| Receive Notifications via Email & Slack | ❌ | ✅ | +| Manage Data Incidents via Slack | ❌ | ✅ | +| View Data Health Dashboard | ❌ | ✅ | +| Evaluate data quality checks on-demand (API + UI) | ❌ | ✅ | +| Evaluate data quality checks in your VPC | ❌ | ✅ | ## Enterprise Grade From 642fb404ec9848e44b96b1d0c9c8c677dbe92ba0 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Tue, 16 Jul 2024 11:50:15 +0200 Subject: [PATCH 08/18] fix(ingest/spark): Fixing Micrometer warning (#10882) --- metadata-integration/java/spark-lineage-beta/README.md | 4 +++- .../src/main/java/datahub/spark/DatahubSparkListener.java | 8 +------- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/metadata-integration/java/spark-lineage-beta/README.md b/metadata-integration/java/spark-lineage-beta/README.md index a643919664b079..b0753936dd677b 100644 --- a/metadata-integration/java/spark-lineage-beta/README.md +++ b/metadata-integration/java/spark-lineage-beta/README.md @@ -346,8 +346,10 @@ Use Java 8 to build the project. The project uses Gradle as the build tool. 
To b + ## Changelog +### Version 0.2.14 +- Fix warning about MeterFilter warning from Micrometer -### Version 0.2.12 +### Version 0.2.13 - Silencing some chatty warnings in RddPathUtils ### Version 0.2.12 diff --git a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubSparkListener.java b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubSparkListener.java index 54bb3821eddedf..96fa74d1bca1fd 100644 --- a/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubSparkListener.java +++ b/metadata-integration/java/spark-lineage-beta/src/main/java/datahub/spark/DatahubSparkListener.java @@ -287,13 +287,7 @@ private static void initializeMetrics(OpenLineageConfig openLineageConfig) { } else { disabledFacets = ""; } - meterRegistry - .config() - .commonTags( - Tags.of( - Tag.of("openlineage.spark.integration.version", Versions.getVersion()), - Tag.of("openlineage.spark.version", sparkVersion), - Tag.of("openlineage.spark.disabled.facets", disabledFacets))); + ((CompositeMeterRegistry) meterRegistry) .getRegistries() .forEach( From ea2216ad6453d1b7dad144bef46ca7fd64cb3f00 Mon Sep 17 00:00:00 2001 From: Gabe Lyons Date: Tue, 16 Jul 2024 08:18:37 -0700 Subject: [PATCH 09/18] fix(structured properties): allow application of structured properties without schema file (#10918) --- .../datahub/api/entities/dataset/dataset.py | 98 +++++++++---------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py index afeedb83f79983..f9a188c65feeff 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py +++ b/metadata-ingestion/src/datahub/api/entities/dataset/dataset.py @@ -259,56 +259,56 @@ def generate_mcp( ) yield mcp - if self.schema_metadata.fields: - for field in self.schema_metadata.fields: - field_urn = field.urn or make_schema_field_urn( - 
self.urn, field.id # type: ignore[arg-type] + if self.schema_metadata.fields: + for field in self.schema_metadata.fields: + field_urn = field.urn or make_schema_field_urn( + self.urn, field.id # type: ignore[arg-type] + ) + assert field_urn.startswith("urn:li:schemaField:") + + if field.globalTags: + mcp = MetadataChangeProposalWrapper( + entityUrn=field_urn, + aspect=GlobalTagsClass( + tags=[ + TagAssociationClass(tag=make_tag_urn(tag)) + for tag in field.globalTags + ] + ), ) - assert field_urn.startswith("urn:li:schemaField:") - - if field.globalTags: - mcp = MetadataChangeProposalWrapper( - entityUrn=field_urn, - aspect=GlobalTagsClass( - tags=[ - TagAssociationClass(tag=make_tag_urn(tag)) - for tag in field.globalTags - ] - ), - ) - yield mcp - - if field.glossaryTerms: - mcp = MetadataChangeProposalWrapper( - entityUrn=field_urn, - aspect=GlossaryTermsClass( - terms=[ - GlossaryTermAssociationClass( - urn=make_term_urn(term) - ) - for term in field.glossaryTerms - ], - auditStamp=self._mint_auditstamp("yaml"), - ), - ) - yield mcp - - if field.structured_properties: - mcp = MetadataChangeProposalWrapper( - entityUrn=field_urn, - aspect=StructuredPropertiesClass( - properties=[ - StructuredPropertyValueAssignmentClass( - propertyUrn=f"urn:li:structuredProperty:{prop_key}", - values=prop_value - if isinstance(prop_value, list) - else [prop_value], - ) - for prop_key, prop_value in field.structured_properties.items() - ] - ), - ) - yield mcp + yield mcp + + if field.glossaryTerms: + mcp = MetadataChangeProposalWrapper( + entityUrn=field_urn, + aspect=GlossaryTermsClass( + terms=[ + GlossaryTermAssociationClass( + urn=make_term_urn(term) + ) + for term in field.glossaryTerms + ], + auditStamp=self._mint_auditstamp("yaml"), + ), + ) + yield mcp + + if field.structured_properties: + mcp = MetadataChangeProposalWrapper( + entityUrn=field_urn, + aspect=StructuredPropertiesClass( + properties=[ + StructuredPropertyValueAssignmentClass( + 
propertyUrn=f"urn:li:structuredProperty:{prop_key}", + values=prop_value + if isinstance(prop_value, list) + else [prop_value], + ) + for prop_key, prop_value in field.structured_properties.items() + ] + ), + ) + yield mcp if self.subtype or self.subtypes: mcp = MetadataChangeProposalWrapper( From ee18a174d1a373f692006920077d5b6e11741059 Mon Sep 17 00:00:00 2001 From: Jay <159848059+jayacryl@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:56:56 -0400 Subject: [PATCH 10/18] fix(data-contracts-web) handle other schedule types (#10919) --- .../contract/FreshnessScheduleSummary.tsx | 33 +++++++++++++------ 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/contract/FreshnessScheduleSummary.tsx b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/contract/FreshnessScheduleSummary.tsx index 434ccb985574f7..5009587c0d2775 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/contract/FreshnessScheduleSummary.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Dataset/Validations/contract/FreshnessScheduleSummary.tsx @@ -13,16 +13,29 @@ type Props = { }; export const FreshnessScheduleSummary = ({ definition, evaluationSchedule }: Props) => { - const scheduleText = - definition.type === FreshnessAssertionScheduleType.Cron - ? `${capitalizeFirstLetter(cronstrue.toString(definition.cron?.cron as string))}.` - : `In the past ${ - definition.fixedInterval?.multiple - } ${definition.fixedInterval?.unit.toLocaleLowerCase()}s${ - (evaluationSchedule && - `, as of ${cronstrue.toString(evaluationSchedule.cron as string).toLowerCase()}`) || - '' - }`; + let scheduleText = ''; + const cronStr = definition.cron?.cron ?? evaluationSchedule?.cron; + switch (definition.type) { + case FreshnessAssertionScheduleType.Cron: + scheduleText = cronStr + ? 
`${capitalizeFirstLetter(cronstrue.toString(cronStr))}.` + : `Unknown freshness schedule.`; + break; + case FreshnessAssertionScheduleType.SinceTheLastCheck: + scheduleText = cronStr + ? `Since the previous check, as of ${cronstrue.toString(cronStr).toLowerCase()}` + : 'Since the previous check'; + break; + case FreshnessAssertionScheduleType.FixedInterval: + scheduleText = `In the past ${ + definition.fixedInterval?.multiple + } ${definition.fixedInterval?.unit.toLocaleLowerCase()}s${ + cronStr ? `, as of ${cronstrue.toString(cronStr).toLowerCase()}` : '' + }`; + break; + default: + break; + } return <>{scheduleText}; }; From bb1ba091cddda253226aa89ba219f663a2e9f7bc Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Tue, 16 Jul 2024 23:06:51 +0530 Subject: [PATCH 11/18] fix(ingestion/tableau): human-readable message for PERMISSIONS_MODE_SWITCHED error (#10866) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/source/tableau.py | 32 ++++- .../setup/permission_mode_switched_error.json | 16 +++ .../tableau/test_tableau_ingest.py | 112 ++++++++++++++---- 3 files changed, 132 insertions(+), 28 deletions(-) create mode 100644 metadata-ingestion/tests/integration/tableau/setup/permission_mode_switched_error.json diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index b14a4a8586c7d8..50fd8ed3dff597 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -1009,10 +1009,34 @@ def get_connection_object_page( error and (error.get(c.EXTENSIONS) or {}).get(c.SEVERITY) == c.WARNING for error in errors ): - self.report.warning( - message=f"Received error fetching Query Connection {connection_type}", - context=f"Errors: {errors}", - ) + # filter out PERMISSIONS_MODE_SWITCHED to report error in human-readable format + other_errors = [] + permission_mode_errors = [] 
+ for error in errors: + if ( + error.get("extensions") + and error["extensions"].get("code") + == "PERMISSIONS_MODE_SWITCHED" + ): + permission_mode_errors.append(error) + else: + other_errors.append(error) + + if other_errors: + self.report.warning( + message=f"Received error fetching Query Connection {connection_type}", + context=f"Errors: {other_errors}", + ) + + if permission_mode_errors: + self.report.warning( + title="Derived Permission Error", + message="Turn on your derived permissions. See for details " + "https://community.tableau.com/s/question/0D54T00000QnjHbSAJ/how-to-fix-the" + "-permissionsmodeswitched-error", + context=f"{permission_mode_errors}", + ) + else: raise RuntimeError(f"Query {connection_type} error: {errors}") diff --git a/metadata-ingestion/tests/integration/tableau/setup/permission_mode_switched_error.json b/metadata-ingestion/tests/integration/tableau/setup/permission_mode_switched_error.json new file mode 100644 index 00000000000000..a8593493a5ec72 --- /dev/null +++ b/metadata-ingestion/tests/integration/tableau/setup/permission_mode_switched_error.json @@ -0,0 +1,16 @@ +{ + "errors":[ + { + "message": "One or more of the attributes used in your filter contain sensitive data so your results have been automatically filtered to contain only the results you have permissions to see", + "extensions": { + "severity": "WARNING", + "code": "PERMISSIONS_MODE_SWITCHED", + "properties": { + "workbooksConnection": [ + "projectNameWithin" + ] + } + } + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index b64609b6ea605f..0891a1e0cd5937 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -2,7 +2,7 @@ import logging import pathlib import sys -from typing import Any, Dict, cast +from typing import Any, Dict, 
List, cast from unittest import mock import pytest @@ -232,6 +232,41 @@ def side_effect_site_get_by_id(id, *arg, **kwargs): return site +def mock_sdk_client( + side_effect_query_metadata_response: List[dict], + datasources_side_effect: List[dict], + sign_out_side_effect: List[dict], +) -> mock.MagicMock: + + mock_client = mock.Mock() + mocked_metadata = mock.Mock() + mocked_metadata.query.side_effect = side_effect_query_metadata_response + mock_client.metadata = mocked_metadata + + mock_client.auth = mock.Mock() + mock_client.site_id = "190a6a5c-63ed-4de1-8045-site1" + mock_client.views = mock.Mock() + mock_client.projects = mock.Mock() + mock_client.sites = mock.Mock() + + mock_client.projects.get.side_effect = side_effect_project_data + mock_client.sites.get.side_effect = side_effect_site_data + mock_client.sites.get_by_id.side_effect = side_effect_site_get_by_id + + mock_client.datasources = mock.Mock() + mock_client.datasources.get.side_effect = datasources_side_effect + mock_client.datasources.get_by_id.side_effect = side_effect_datasource_get_by_id + + mock_client.workbooks = mock.Mock() + mock_client.workbooks.get.side_effect = side_effect_workbook_data + + mock_client.views.get.side_effect = side_effect_usage_stat + mock_client.auth.sign_in.return_value = None + mock_client.auth.sign_out.side_effect = sign_out_side_effect + + return mock_client + + def tableau_ingest_common( pytestconfig, tmp_path, @@ -251,30 +286,11 @@ def tableau_ingest_common( mock_checkpoint.return_value = mock_datahub_graph with mock.patch("datahub.ingestion.source.tableau.Server") as mock_sdk: - mock_client = mock.Mock() - mocked_metadata = mock.Mock() - mocked_metadata.query.side_effect = side_effect_query_metadata_response - mock_client.metadata = mocked_metadata - mock_client.auth = mock.Mock() - mock_client.site_id = "190a6a5c-63ed-4de1-8045-site1" - mock_client.views = mock.Mock() - mock_client.projects = mock.Mock() - mock_client.sites = mock.Mock() - - 
mock_client.projects.get.side_effect = side_effect_project_data - mock_client.sites.get.side_effect = side_effect_site_data - mock_client.sites.get_by_id.side_effect = side_effect_site_get_by_id - mock_client.datasources = mock.Mock() - mock_client.datasources.get.side_effect = datasources_side_effect - mock_client.datasources.get_by_id.side_effect = ( - side_effect_datasource_get_by_id + mock_sdk.return_value = mock_sdk_client( + side_effect_query_metadata_response=side_effect_query_metadata_response, + datasources_side_effect=datasources_side_effect, + sign_out_side_effect=sign_out_side_effect, ) - mock_client.workbooks = mock.Mock() - mock_client.workbooks.get.side_effect = side_effect_workbook_data - mock_client.views.get.side_effect = side_effect_usage_stat - mock_client.auth.sign_in.return_value = None - mock_client.auth.sign_out.side_effect = sign_out_side_effect - mock_sdk.return_value = mock_client mock_sdk._auth_token = "ABC" pipeline = Pipeline.create( @@ -1106,3 +1122,51 @@ def test_site_name_pattern(pytestconfig, tmp_path, mock_datahub_graph): pipeline_config=new_config, pipeline_name="test_tableau_site_name_pattern_ingest", ) + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_permission_mode_switched_error(pytestconfig, tmp_path, mock_datahub_graph): + + with mock.patch( + "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", + mock_datahub_graph, + ) as mock_checkpoint: + mock_checkpoint.return_value = mock_datahub_graph + + with mock.patch("datahub.ingestion.source.tableau.Server") as mock_sdk: + mock_sdk.return_value = mock_sdk_client( + side_effect_query_metadata_response=[ + read_response(pytestconfig, "permission_mode_switched_error.json") + ], + sign_out_side_effect=[{}], + datasources_side_effect=[{}], + ) + + reporter = TableauSourceReport() + tableau_source = TableauSiteSource( + platform="tableau", + config=mock.MagicMock(), + ctx=mock.MagicMock(), + site=mock.MagicMock(), + 
server=mock_sdk.return_value, + report=reporter, + ) + + tableau_source.get_connection_object_page( + query=mock.MagicMock(), + connection_type=mock.MagicMock(), + query_filter=mock.MagicMock(), + retries_remaining=1, + ) + + warnings = list(reporter.warnings) + + assert len(warnings) == 1 + + assert warnings[0].title == "Derived Permission Error" + + assert warnings[0].message == ( + "Turn on your derived permissions. See for details " + "https://community.tableau.com/s/question/0D54T00000QnjHbSAJ/how-to-fix-the-permissionsmodeswitched-error" + ) From 12ee4853022fc29ec2f303e994529a8bfb8291b8 Mon Sep 17 00:00:00 2001 From: ethan-cartwright Date: Tue, 16 Jul 2024 13:54:43 -0400 Subject: [PATCH 12/18] Add feature flag for view defintions (#10914) Co-authored-by: Ethan Cartwright --- .../datahub/ingestion/source/snowflake/snowflake_config.py | 5 +++++ .../ingestion/source/snowflake/snowflake_schema_gen.py | 6 +++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index f6247eb949417b..365e32dac3e696 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -181,6 +181,11 @@ class SnowflakeV2Config( description="If enabled, populates the snowflake usage statistics. 
Requires appropriate grants given to the role.", ) + include_view_definitions: bool = Field( + default=True, + description="If enabled, populates the ingested views' definitions.", + ) + include_technical_schema: bool = Field( default=True, description="If enabled, populates the snowflake technical schema and descriptions.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index e604ed96b8eb6b..dcc18635de32c3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -753,7 +753,11 @@ def gen_dataset_workunits( view_properties_aspect = ViewProperties( materialized=table.materialized, viewLanguage="SQL", - viewLogic=table.view_definition, + viewLogic=( + table.view_definition + if self.config.include_view_definitions + else "" + ), ) yield MetadataChangeProposalWrapper( From ff1c6b895e2a605263a0a138aeb88aa7703f4d33 Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Wed, 17 Jul 2024 00:16:42 +0530 Subject: [PATCH 13/18] feat(ingest/BigQuery): refactor+parallelize dataset metadata extraction (#10884) --- .../docs/dev_guides/classification.md | 2 +- .../datahub/ingestion/glossary/classifier.py | 2 +- .../ingestion/source/bigquery_v2/bigquery.py | 1248 +---------------- .../bigquery_v2/bigquery_audit_log_api.py | 2 + .../source/bigquery_v2/bigquery_config.py | 14 +- .../source/bigquery_v2/bigquery_report.py | 25 +- .../source/bigquery_v2/bigquery_schema.py | 282 ++-- .../source/bigquery_v2/bigquery_schema_gen.py | 1090 ++++++++++++++ .../bigquery_v2/bigquery_test_connection.py | 178 +++ .../ingestion/source/bigquery_v2/lineage.py | 175 ++- .../ingestion/source/bigquery_v2/profiler.py | 5 +- .../ingestion/source/bigquery_v2/usage.py | 60 +- 
.../source/snowflake/snowflake_schema_gen.py | 48 +- .../utilities/threaded_iterator_executor.py | 52 + .../integration/bigquery_v2/test_bigquery.py | 6 +- .../tests/unit/test_bigquery_source.py | 30 +- .../test_threaded_iterator_executor.py | 14 + 17 files changed, 1682 insertions(+), 1551 deletions(-) create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py create mode 100644 metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py create mode 100644 metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py create mode 100644 metadata-ingestion/tests/unit/utilities/test_threaded_iterator_executor.py diff --git a/metadata-ingestion/docs/dev_guides/classification.md b/metadata-ingestion/docs/dev_guides/classification.md index f20638a2ab5bde..39eac229a66013 100644 --- a/metadata-ingestion/docs/dev_guides/classification.md +++ b/metadata-ingestion/docs/dev_guides/classification.md @@ -10,7 +10,7 @@ Note that a `.` is used to denote nested fields in the YAML recipe. | ------------------------- | -------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | | enabled | | boolean | Whether classification should be used to auto-detect glossary terms | False | | sample_size | | int | Number of sample values used for classification. | 100 | -| max_workers | | int | Number of worker threads to use for classification. Set to 1 to disable. | Number of cpu cores or 4 | +| max_workers | | int | Number of worker processes to use for classification. Set to 1 to disable. 
| Number of cpu cores or 4 | | info_type_to_term | | Dict[str,string] | Optional mapping to provide glossary term identifier for info type. | By default, info type is used as glossary term identifier. | | classifiers | | Array of object | Classifiers to use to auto-detect glossary terms. If more than one classifier, infotype predictions from the classifier defined later in sequence take precedance. | [{'type': 'datahub', 'config': None}] | | table_pattern | | AllowDenyPattern (see below for fields) | Regex patterns to filter tables for classification. This is used in combination with other patterns in parent config. Specify regex to match the entire table name in `database.schema.table` format. e.g. to match all tables starting with customer in Customer database and public schema, use the regex 'Customer.public.customer.*' | {'allow': ['.*'], 'deny': [], 'ignoreCase': True} | diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py index 99789a49c0b43a..ddcb74e354613a 100644 --- a/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py +++ b/metadata-ingestion/src/datahub/ingestion/glossary/classifier.py @@ -39,7 +39,7 @@ class ClassificationConfig(ConfigModel): max_workers: int = Field( default=(os.cpu_count() or 4), - description="Number of worker threads to use for classification. Set to 1 to disable.", + description="Number of worker processes to use for classification. 
Set to 1 to disable.", ) table_pattern: AllowDenyPattern = Field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 5046f52cdce26e..7a96b2f0643ab0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -2,24 +2,9 @@ import functools import logging import os -import re -import traceback -from collections import defaultdict -from datetime import datetime, timedelta -from typing import Dict, Iterable, List, Optional, Set, Type, Union, cast +from typing import Iterable, List, Optional -from google.cloud import bigquery -from google.cloud.bigquery.table import TableListItem - -from datahub.configuration.pattern_utils import is_schema_allowed, is_tag_allowed -from datahub.emitter.mce_builder import ( - make_data_platform_urn, - make_dataplatform_instance_urn, - make_dataset_urn, - make_tag_urn, -) -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey +from datahub.emitter.mce_builder import make_dataset_urn from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, @@ -30,54 +15,31 @@ ) from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.source import ( - CapabilityReport, MetadataWorkUnitProcessor, SourceCapability, TestableSource, TestConnectionReport, ) from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.glossary.classification_mixin import ( - SAMPLE_SIZE_MULTIPLIER, - ClassificationHandler, - classification_workunit_processor, -) from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( BigqueryTableIdentifier, BigQueryTableRef, ) from datahub.ingestion.source.bigquery_v2.bigquery_config 
import BigQueryV2Config -from datahub.ingestion.source.bigquery_v2.bigquery_data_reader import BigQueryDataReader -from datahub.ingestion.source.bigquery_v2.bigquery_helper import ( - unquote_and_decode_unicode_escape_seq, -) from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( - BigqueryColumn, - BigqueryDataset, BigqueryProject, BigQuerySchemaApi, - BigqueryTable, - BigqueryTableSnapshot, - BigqueryView, ) -from datahub.ingestion.source.bigquery_v2.common import ( - BQ_EXTERNAL_DATASET_URL_TEMPLATE, - BQ_EXTERNAL_TABLE_URL_TEMPLATE, +from datahub.ingestion.source.bigquery_v2.bigquery_schema_gen import ( + BigQuerySchemaGenerator, +) +from datahub.ingestion.source.bigquery_v2.bigquery_test_connection import ( + BigQueryTestConnection, ) from datahub.ingestion.source.bigquery_v2.lineage import BigqueryLineageExtractor from datahub.ingestion.source.bigquery_v2.profiler import BigqueryProfiler from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor -from datahub.ingestion.source.common.subtypes import ( - DatasetContainerSubTypes, - DatasetSubTypes, -) -from datahub.ingestion.source.sql.sql_utils import ( - add_table_to_schema_container, - gen_database_container, - gen_schema_container, - get_domain_wu, -) from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, @@ -89,57 +51,11 @@ from datahub.ingestion.source.state.stateful_ingestion_base import ( StatefulIngestionSourceBase, ) -from datahub.ingestion.source_report.ingestion_stage import ( - METADATA_EXTRACTION, - PROFILING, -) -from datahub.metadata.com.linkedin.pegasus2avro.common import ( - Status, - SubTypes, - TimeStamp, -) -from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( - DatasetProperties, - ViewProperties, -) -from 
datahub.metadata.com.linkedin.pegasus2avro.schema import ( - ArrayType, - BooleanType, - BytesType, - DateType, - MySqlDDL, - NullType, - NumberType, - RecordType, - SchemaField, - SchemaFieldDataType, - SchemaMetadata, - StringType, - TimeType, -) -from datahub.metadata.schema_classes import ( - DataPlatformInstanceClass, - GlobalTagsClass, - TagAssociationClass, -) from datahub.sql_parsing.schema_resolver import SchemaResolver -from datahub.utilities.file_backed_collections import FileBackedDict -from datahub.utilities.hive_schema_to_avro import ( - HiveColumnToAvroConverter, - get_schema_fields_for_hive_column, -) -from datahub.utilities.mapping import Constants -from datahub.utilities.perf_timer import PerfTimer -from datahub.utilities.ratelimiter import RateLimiter from datahub.utilities.registries.domain_registry import DomainRegistry logger: logging.Logger = logging.getLogger(__name__) -# Handle table snapshots -# See https://cloud.google.com/bigquery/docs/table-snapshots-intro. -SNAPSHOT_TABLE_REGEX = re.compile(r"^(.+)@(\d{13})$") -CLUSTERING_COLUMN_TAG = "CLUSTERING_COLUMN" - # We can't use close as it is not called if the ingestion is not successful def cleanup(config: BigQueryV2Config) -> None: @@ -178,58 +94,18 @@ def cleanup(config: BigQueryV2Config) -> None: supported=True, ) class BigqueryV2Source(StatefulIngestionSourceBase, TestableSource): - # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types - # Note: We use the hive schema parser to parse nested BigQuery types. We also have - # some extra type mappings in that file. 
- BIGQUERY_FIELD_TYPE_MAPPINGS: Dict[ - str, - Type[ - Union[ - ArrayType, - BytesType, - BooleanType, - NumberType, - RecordType, - StringType, - TimeType, - DateType, - NullType, - ] - ], - ] = { - "BYTES": BytesType, - "BOOL": BooleanType, - "DECIMAL": NumberType, - "NUMERIC": NumberType, - "BIGNUMERIC": NumberType, - "BIGDECIMAL": NumberType, - "FLOAT64": NumberType, - "INT": NumberType, - "INT64": NumberType, - "SMALLINT": NumberType, - "INTEGER": NumberType, - "BIGINT": NumberType, - "TINYINT": NumberType, - "BYTEINT": NumberType, - "STRING": StringType, - "TIME": TimeType, - "TIMESTAMP": TimeType, - "DATE": DateType, - "DATETIME": TimeType, - "GEOGRAPHY": NullType, - "JSON": RecordType, - "INTERVAL": NullType, - "ARRAY": ArrayType, - "STRUCT": RecordType, - } - def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): super().__init__(config, ctx) self.config: BigQueryV2Config = config self.report: BigQueryV2Report = BigQueryV2Report() - self.classification_handler = ClassificationHandler(self.config, self.report) self.platform: str = "bigquery" + self.domain_registry: Optional[DomainRegistry] = None + if self.config.domain: + self.domain_registry = DomainRegistry( + cached_domains=[k for k in self.config.domain], graph=self.ctx.graph + ) + BigqueryTableIdentifier._BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX = ( self.config.sharded_table_pattern ) @@ -247,12 +123,6 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): self.sql_parser_schema_resolver = self._init_schema_resolver() - self.data_reader: Optional[BigQueryDataReader] = None - if self.classification_handler.is_classification_enabled(): - self.data_reader = BigQueryDataReader.create( - self.config.get_bigquery_client() - ) - redundant_lineage_run_skip_handler: Optional[ RedundantLineageRunSkipHandler ] = None @@ -289,12 +159,6 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): redundant_run_skip_handler=redundant_usage_run_skip_handler, ) - 
self.domain_registry: Optional[DomainRegistry] = None - if self.config.domain: - self.domain_registry = DomainRegistry( - cached_domains=[k for k in self.config.domain], graph=self.ctx.graph - ) - self.profiling_state_handler: Optional[ProfilingHandler] = None if self.config.enable_stateful_profiling: self.profiling_state_handler = ProfilingHandler( @@ -307,17 +171,15 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): config, self.report, self.profiling_state_handler ) - # Global store of table identifiers for lineage filtering - self.table_refs: Set[str] = set() - - # Maps project -> view_ref, so we can find all views in a project - self.view_refs_by_project: Dict[str, Set[str]] = defaultdict(set) - # Maps project -> snapshot_ref, so we can find all snapshots in a project - self.snapshot_refs_by_project: Dict[str, Set[str]] = defaultdict(set) - # Maps view ref -> actual sql - self.view_definitions: FileBackedDict[str] = FileBackedDict() - # Maps snapshot ref -> Snapshot - self.snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot] = FileBackedDict() + self.bq_schema_extractor = BigQuerySchemaGenerator( + self.config, + self.report, + self.bigquery_data_dictionary, + self.domain_registry, + self.sql_parser_schema_resolver, + self.profiler, + self.gen_dataset_urn, + ) self.add_config_to_report() atexit.register(cleanup, config) @@ -327,161 +189,9 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "BigqueryV2Source": config = BigQueryV2Config.parse_obj(config_dict) return cls(ctx, config) - @staticmethod - def connectivity_test(client: bigquery.Client) -> CapabilityReport: - ret = client.query("select 1") - if ret.error_result: - return CapabilityReport( - capable=False, failure_reason=f"{ret.error_result['message']}" - ) - else: - return CapabilityReport(capable=True) - - @property - def store_table_refs(self): - return self.config.include_table_lineage or self.config.include_usage_statistics - - @staticmethod - def 
metadata_read_capability_test( - project_ids: List[str], config: BigQueryV2Config - ) -> CapabilityReport: - for project_id in project_ids: - try: - logger.info(f"Metadata read capability test for project {project_id}") - client: bigquery.Client = config.get_bigquery_client() - assert client - bigquery_data_dictionary = BigQuerySchemaApi( - BigQueryV2Report().schema_api_perf, client - ) - result = bigquery_data_dictionary.get_datasets_for_project_id( - project_id, 10 - ) - if len(result) == 0: - return CapabilityReport( - capable=False, - failure_reason=f"Dataset query returned empty dataset. It is either empty or no dataset in project {project_id}", - ) - tables = bigquery_data_dictionary.get_tables_for_dataset( - project_id=project_id, - dataset_name=result[0].name, - tables={}, - with_data_read_permission=config.have_table_data_read_permission, - ) - if len(list(tables)) == 0: - return CapabilityReport( - capable=False, - failure_reason=f"Tables query did not return any table. It is either empty or no tables in project {project_id}.{result[0].name}", - ) - - except Exception as e: - return CapabilityReport( - capable=False, - failure_reason=f"Dataset query failed with error: {e}", - ) - - return CapabilityReport(capable=True) - - @staticmethod - def lineage_capability_test( - connection_conf: BigQueryV2Config, - project_ids: List[str], - report: BigQueryV2Report, - ) -> CapabilityReport: - lineage_extractor = BigqueryLineageExtractor( - connection_conf, report, lambda ref: "" - ) - for project_id in project_ids: - try: - logger.info(f"Lineage capability test for project {project_id}") - lineage_extractor.test_capability(project_id) - except Exception as e: - return CapabilityReport( - capable=False, - failure_reason=f"Lineage capability test failed with: {e}", - ) - - return CapabilityReport(capable=True) - - @staticmethod - def usage_capability_test( - connection_conf: BigQueryV2Config, - project_ids: List[str], - report: BigQueryV2Report, - ) -> 
CapabilityReport: - usage_extractor = BigQueryUsageExtractor( - connection_conf, - report, - schema_resolver=SchemaResolver(platform="bigquery"), - dataset_urn_builder=lambda ref: "", - ) - for project_id in project_ids: - try: - logger.info(f"Usage capability test for project {project_id}") - failures_before_test = len(report.failures) - usage_extractor.test_capability(project_id) - if failures_before_test != len(report.failures): - return CapabilityReport( - capable=False, - failure_reason="Usage capability test failed. Check the logs for further info", - ) - except Exception as e: - return CapabilityReport( - capable=False, - failure_reason=f"Usage capability test failed with: {e} for project {project_id}", - ) - return CapabilityReport(capable=True) - @staticmethod def test_connection(config_dict: dict) -> TestConnectionReport: - test_report = TestConnectionReport() - _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict() - - try: - connection_conf = BigQueryV2Config.parse_obj_allow_extras(config_dict) - client: bigquery.Client = connection_conf.get_bigquery_client() - assert client - - test_report.basic_connectivity = BigqueryV2Source.connectivity_test(client) - - connection_conf.start_time = datetime.now() - connection_conf.end_time = datetime.now() + timedelta(minutes=1) - - report: BigQueryV2Report = BigQueryV2Report() - project_ids: List[str] = [] - projects = client.list_projects() - - for project in projects: - if connection_conf.project_id_pattern.allowed(project.project_id): - project_ids.append(project.project_id) - - metadata_read_capability = BigqueryV2Source.metadata_read_capability_test( - project_ids, connection_conf - ) - if SourceCapability.SCHEMA_METADATA not in _report: - _report[SourceCapability.SCHEMA_METADATA] = metadata_read_capability - - if connection_conf.include_table_lineage: - lineage_capability = BigqueryV2Source.lineage_capability_test( - connection_conf, project_ids, report - ) - if SourceCapability.LINEAGE_COARSE 
not in _report: - _report[SourceCapability.LINEAGE_COARSE] = lineage_capability - - if connection_conf.include_usage_statistics: - usage_capability = BigqueryV2Source.usage_capability_test( - connection_conf, project_ids, report - ) - if SourceCapability.USAGE_STATS not in _report: - _report[SourceCapability.USAGE_STATS] = usage_capability - - test_report.capability_report = _report - return test_report - - except Exception as e: - test_report.basic_connectivity = CapabilityReport( - capable=False, failure_reason=f"{e}" - ) - return test_report + return BigQueryTestConnection.test_connection(config_dict) def _init_schema_resolver(self) -> SchemaResolver: schema_resolution_required = ( @@ -509,83 +219,6 @@ def _init_schema_resolver(self) -> SchemaResolver: ) return SchemaResolver(platform=self.platform, env=self.config.env) - def get_dataplatform_instance_aspect( - self, dataset_urn: str, project_id: str - ) -> MetadataWorkUnit: - aspect = DataPlatformInstanceClass( - platform=make_data_platform_urn(self.platform), - instance=( - make_dataplatform_instance_urn(self.platform, project_id) - if self.config.include_data_platform_instance - else None - ), - ) - return MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=aspect - ).as_workunit() - - def gen_dataset_key(self, db_name: str, schema: str) -> ContainerKey: - return BigQueryDatasetKey( - project_id=db_name, - dataset_id=schema, - platform=self.platform, - env=self.config.env, - backcompat_env_as_instance=True, - ) - - def gen_project_id_key(self, database: str) -> ContainerKey: - return ProjectIdKey( - project_id=database, - platform=self.platform, - env=self.config.env, - backcompat_env_as_instance=True, - ) - - def gen_project_id_containers(self, database: str) -> Iterable[MetadataWorkUnit]: - database_container_key = self.gen_project_id_key(database) - - yield from gen_database_container( - database=database, - name=database, - sub_types=[DatasetContainerSubTypes.BIGQUERY_PROJECT], - 
domain_registry=self.domain_registry, - domain_config=self.config.domain, - database_container_key=database_container_key, - ) - - def gen_dataset_containers( - self, dataset: str, project_id: str, tags: Optional[Dict[str, str]] = None - ) -> Iterable[MetadataWorkUnit]: - schema_container_key = self.gen_dataset_key(project_id, dataset) - - tags_joined: Optional[List[str]] = None - if tags and self.config.capture_dataset_label_as_tag: - tags_joined = [ - f"{k}:{v}" - for k, v in tags.items() - if is_tag_allowed(self.config.capture_dataset_label_as_tag, k) - ] - - database_container_key = self.gen_project_id_key(database=project_id) - - yield from gen_schema_container( - database=project_id, - schema=dataset, - sub_types=[DatasetContainerSubTypes.BIGQUERY_DATASET], - domain_registry=self.domain_registry, - domain_config=self.config.domain, - schema_container_key=schema_container_key, - database_container_key=database_container_key, - external_url=( - BQ_EXTERNAL_DATASET_URL_TEMPLATE.format( - project=project_id, dataset=dataset - ) - if self.config.include_external_url - else None - ), - tags=tags_joined, - ) - def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), @@ -603,25 +236,23 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: return if self.config.include_schema_metadata: - for project_id in projects: - self.report.set_ingestion_stage(project_id.id, METADATA_EXTRACTION) - logger.info(f"Processing project: {project_id.id}") - yield from self._process_project(project_id) + for project in projects: + yield from self.bq_schema_extractor.get_project_workunits(project) if self.config.include_usage_statistics: yield from self.usage_extractor.get_usage_workunits( - [p.id for p in projects], self.table_refs + [p.id for p in projects], self.bq_schema_extractor.table_refs ) if self.config.include_table_lineage: yield from self.lineage_extractor.get_lineage_workunits( [p.id for p in 
projects], self.sql_parser_schema_resolver, - self.view_refs_by_project, - self.view_definitions, - self.snapshot_refs_by_project, - self.snapshots_by_ref, - self.table_refs, + self.bq_schema_extractor.view_refs_by_project, + self.bq_schema_extractor.view_definitions, + self.bq_schema_extractor.snapshot_refs_by_project, + self.bq_schema_extractor.snapshots_by_ref, + self.bq_schema_extractor.table_refs, ) def _get_projects(self) -> List[BigqueryProject]: @@ -636,15 +267,25 @@ def _get_projects(self) -> List[BigqueryProject]: return list(self._query_project_list()) def _query_project_list(self) -> Iterable[BigqueryProject]: - projects = self.bigquery_data_dictionary.get_projects() - if not projects: # Report failure on exception and if empty list is returned - self.report.report_failure( - "metadata-extraction", - "Get projects didn't return any project. " - "Maybe resourcemanager.projects.get permission is missing for the service account. " + try: + projects = self.bigquery_data_dictionary.get_projects() + + if ( + not projects + ): # Report failure on exception and if empty list is returned + self.report.failure( + title="Get projects didn't return any project. ", + message="Maybe resourcemanager.projects.get permission is missing for the service account. " + "You can assign predefined roles/bigquery.metadataViewer role to your service account.", + ) + except Exception as e: + self.report.failure( + title="Failed to get BigQuery Projects", + message="Maybe resourcemanager.projects.get permission is missing for the service account. 
" "You can assign predefined roles/bigquery.metadataViewer role to your service account.", + exc=e, ) - return + projects = [] for project in projects: if self.config.project_id_pattern.allowed(project.id): @@ -652,567 +293,6 @@ def _query_project_list(self) -> Iterable[BigqueryProject]: else: self.report.report_dropped(project.id) - def _process_project( - self, bigquery_project: BigqueryProject - ) -> Iterable[MetadataWorkUnit]: - db_tables: Dict[str, List[BigqueryTable]] = {} - db_views: Dict[str, List[BigqueryView]] = {} - db_snapshots: Dict[str, List[BigqueryTableSnapshot]] = {} - - project_id = bigquery_project.id - try: - bigquery_project.datasets = ( - self.bigquery_data_dictionary.get_datasets_for_project_id(project_id) - ) - except Exception as e: - error_message = f"Unable to get datasets for project {project_id}, skipping. The error was: {e}" - if self.config.is_profiling_enabled(): - error_message = f"Unable to get datasets for project {project_id}, skipping. Does your service account has bigquery.datasets.get permission? The error was: {e}" - logger.error(error_message) - self.report.report_failure( - "metadata-extraction", - f"{project_id} - {error_message}", - ) - return None - - if len(bigquery_project.datasets) == 0: - more_info = ( - "Either there are no datasets in this project or missing bigquery.datasets.get permission. " - "You can assign predefined roles/bigquery.metadataViewer role to your service account." - ) - if self.config.exclude_empty_projects: - self.report.report_dropped(project_id) - warning_message = f"Excluded project '{project_id}' since no were datasets found. {more_info}" - else: - yield from self.gen_project_id_containers(project_id) - warning_message = ( - f"No datasets found in project '{project_id}'. 
{more_info}" - ) - logger.warning(warning_message) - return - - yield from self.gen_project_id_containers(project_id) - - self.report.num_project_datasets_to_scan[project_id] = len( - bigquery_project.datasets - ) - for bigquery_dataset in bigquery_project.datasets: - if not is_schema_allowed( - self.config.dataset_pattern, - bigquery_dataset.name, - project_id, - self.config.match_fully_qualified_names, - ): - self.report.report_dropped(f"{bigquery_dataset.name}.*") - continue - try: - # db_tables, db_views, and db_snapshots are populated in the this method - yield from self._process_schema( - project_id, bigquery_dataset, db_tables, db_views, db_snapshots - ) - - except Exception as e: - error_message = f"Unable to get tables for dataset {bigquery_dataset.name} in project {project_id}, skipping. Does your service account has bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission? The error was: {e}" - if self.config.is_profiling_enabled(): - error_message = f"Unable to get tables for dataset {bigquery_dataset.name} in project {project_id}, skipping. Does your service account has bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission, bigquery.tables.getData permission? 
The error was: {e}" - - trace = traceback.format_exc() - logger.error(trace) - logger.error(error_message) - self.report.report_failure( - "metadata-extraction", - f"{project_id}.{bigquery_dataset.name} - {error_message} - {trace}", - ) - continue - - if self.config.is_profiling_enabled(): - logger.info(f"Starting profiling project {project_id}") - self.report.set_ingestion_stage(project_id, PROFILING) - yield from self.profiler.get_workunits( - project_id=project_id, - tables=db_tables, - ) - - def _process_schema( - self, - project_id: str, - bigquery_dataset: BigqueryDataset, - db_tables: Dict[str, List[BigqueryTable]], - db_views: Dict[str, List[BigqueryView]], - db_snapshots: Dict[str, List[BigqueryTableSnapshot]], - ) -> Iterable[MetadataWorkUnit]: - dataset_name = bigquery_dataset.name - - yield from self.gen_dataset_containers( - dataset_name, project_id, bigquery_dataset.labels - ) - - columns = None - - rate_limiter: Optional[RateLimiter] = None - if self.config.rate_limit: - rate_limiter = RateLimiter( - max_calls=self.config.requests_per_min, period=60 - ) - - if ( - self.config.include_tables - or self.config.include_views - or self.config.include_table_snapshots - ): - columns = self.bigquery_data_dictionary.get_columns_for_dataset( - project_id=project_id, - dataset_name=dataset_name, - column_limit=self.config.column_limit, - run_optimized_column_query=self.config.run_optimized_column_query, - extract_policy_tags_from_catalog=self.config.extract_policy_tags_from_catalog, - report=self.report, - rate_limiter=rate_limiter, - ) - - if self.config.include_tables: - db_tables[dataset_name] = list( - self.get_tables_for_dataset(project_id, dataset_name) - ) - - for table in db_tables[dataset_name]: - table_columns = columns.get(table.name, []) if columns else [] - table_wu_generator = self._process_table( - table=table, - columns=table_columns, - project_id=project_id, - dataset_name=dataset_name, - ) - yield from classification_workunit_processor( - 
table_wu_generator, - self.classification_handler, - self.data_reader, - [project_id, dataset_name, table.name], - data_reader_kwargs=dict( - sample_size_percent=( - self.config.classification.sample_size - * SAMPLE_SIZE_MULTIPLIER - / table.rows_count - if table.rows_count - else None - ) - ), - ) - elif self.store_table_refs: - # Need table_refs to calculate lineage and usage - for table_item in self.bigquery_data_dictionary.list_tables( - dataset_name, project_id - ): - identifier = BigqueryTableIdentifier( - project_id=project_id, - dataset=dataset_name, - table=table_item.table_id, - ) - if not self.config.table_pattern.allowed(identifier.raw_table_name()): - self.report.report_dropped(identifier.raw_table_name()) - continue - try: - self.table_refs.add( - str(BigQueryTableRef(identifier).get_sanitized_table_ref()) - ) - except Exception as e: - logger.warning( - f"Could not create table ref for {table_item.path}: {e}" - ) - - if self.config.include_views: - db_views[dataset_name] = list( - self.bigquery_data_dictionary.get_views_for_dataset( - project_id, - dataset_name, - self.config.is_profiling_enabled(), - self.report, - ) - ) - - for view in db_views[dataset_name]: - view_columns = columns.get(view.name, []) if columns else [] - yield from self._process_view( - view=view, - columns=view_columns, - project_id=project_id, - dataset_name=dataset_name, - ) - - if self.config.include_table_snapshots: - db_snapshots[dataset_name] = list( - self.bigquery_data_dictionary.get_snapshots_for_dataset( - project_id, - dataset_name, - self.config.is_profiling_enabled(), - self.report, - ) - ) - - for snapshot in db_snapshots[dataset_name]: - snapshot_columns = columns.get(snapshot.name, []) if columns else [] - yield from self._process_snapshot( - snapshot=snapshot, - columns=snapshot_columns, - project_id=project_id, - dataset_name=dataset_name, - ) - - # This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here 
- # because the profiler doesn't have access to columns - def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]: - ignore_list: List[str] = [] - for column in columns: - if not column.data_type or any( - word in column.data_type.lower() - for word in ["array", "struct", "geography", "json"] - ): - ignore_list.append(column.field_path) - return ignore_list - - def _process_table( - self, - table: BigqueryTable, - columns: List[BigqueryColumn], - project_id: str, - dataset_name: str, - ) -> Iterable[MetadataWorkUnit]: - table_identifier = BigqueryTableIdentifier(project_id, dataset_name, table.name) - - self.report.report_entity_scanned(table_identifier.raw_table_name()) - - if not self.config.table_pattern.allowed(table_identifier.raw_table_name()): - self.report.report_dropped(table_identifier.raw_table_name()) - return - - if self.store_table_refs: - self.table_refs.add( - str(BigQueryTableRef(table_identifier).get_sanitized_table_ref()) - ) - table.column_count = len(columns) - - # We only collect profile ignore list if profiling is enabled and profile_table_level_only is false - if ( - self.config.is_profiling_enabled() - and not self.config.profiling.profile_table_level_only - ): - table.columns_ignore_from_profiling = self.generate_profile_ignore_list( - columns - ) - - if not table.column_count: - logger.warning( - f"Table doesn't have any column or unable to get columns for table: {table_identifier}" - ) - - # If table has time partitioning, set the data type of the partitioning field - if table.partition_info: - table.partition_info.column = next( - ( - column - for column in columns - if column.name == table.partition_info.field - ), - None, - ) - yield from self.gen_table_dataset_workunits( - table, columns, project_id, dataset_name - ) - - def _process_view( - self, - view: BigqueryView, - columns: List[BigqueryColumn], - project_id: str, - dataset_name: str, - ) -> Iterable[MetadataWorkUnit]: - table_identifier = 
BigqueryTableIdentifier(project_id, dataset_name, view.name) - - self.report.report_entity_scanned(table_identifier.raw_table_name(), "view") - - if not self.config.view_pattern.allowed(table_identifier.raw_table_name()): - self.report.report_dropped(table_identifier.raw_table_name()) - return - - if self.store_table_refs: - table_ref = str( - BigQueryTableRef(table_identifier).get_sanitized_table_ref() - ) - self.table_refs.add(table_ref) - if self.config.lineage_parse_view_ddl and view.view_definition: - self.view_refs_by_project[project_id].add(table_ref) - self.view_definitions[table_ref] = view.view_definition - - view.column_count = len(columns) - if not view.column_count: - logger.warning( - f"View doesn't have any column or unable to get columns for table: {table_identifier}" - ) - - yield from self.gen_view_dataset_workunits( - table=view, - columns=columns, - project_id=project_id, - dataset_name=dataset_name, - ) - - def _process_snapshot( - self, - snapshot: BigqueryTableSnapshot, - columns: List[BigqueryColumn], - project_id: str, - dataset_name: str, - ) -> Iterable[MetadataWorkUnit]: - table_identifier = BigqueryTableIdentifier( - project_id, dataset_name, snapshot.name - ) - - self.report.snapshots_scanned += 1 - - if not self.config.table_snapshot_pattern.allowed( - table_identifier.raw_table_name() - ): - self.report.report_dropped(table_identifier.raw_table_name()) - return - - snapshot.columns = columns - snapshot.column_count = len(columns) - if not snapshot.column_count: - logger.warning( - f"Snapshot doesn't have any column or unable to get columns for table: {table_identifier}" - ) - - if self.store_table_refs: - table_ref = str( - BigQueryTableRef(table_identifier).get_sanitized_table_ref() - ) - self.table_refs.add(table_ref) - if snapshot.base_table_identifier: - self.snapshot_refs_by_project[project_id].add(table_ref) - self.snapshots_by_ref[table_ref] = snapshot - - yield from self.gen_snapshot_dataset_workunits( - table=snapshot, - 
columns=columns, - project_id=project_id, - dataset_name=dataset_name, - ) - - def gen_table_dataset_workunits( - self, - table: BigqueryTable, - columns: List[BigqueryColumn], - project_id: str, - dataset_name: str, - ) -> Iterable[MetadataWorkUnit]: - custom_properties: Dict[str, str] = {} - if table.expires: - custom_properties["expiration_date"] = str(table.expires) - - if table.partition_info: - custom_properties["partition_info"] = str(table.partition_info) - - if table.size_in_bytes: - custom_properties["size_in_bytes"] = str(table.size_in_bytes) - - if table.active_billable_bytes: - custom_properties["billable_bytes_active"] = str( - table.active_billable_bytes - ) - - if table.long_term_billable_bytes: - custom_properties["billable_bytes_long_term"] = str( - table.long_term_billable_bytes - ) - - if table.max_partition_id: - custom_properties["number_of_partitions"] = str(table.num_partitions) - custom_properties["max_partition_id"] = str(table.max_partition_id) - custom_properties["is_partitioned"] = str(True) - - sub_types: List[str] = [DatasetSubTypes.TABLE] - if table.max_shard_id: - custom_properties["max_shard_id"] = str(table.max_shard_id) - custom_properties["is_sharded"] = str(True) - sub_types = ["sharded table"] + sub_types - - tags_to_add = None - if table.labels and self.config.capture_table_label_as_tag: - tags_to_add = [] - tags_to_add.extend( - [ - make_tag_urn(f"""{k}:{v}""") - for k, v in table.labels.items() - if is_tag_allowed(self.config.capture_table_label_as_tag, k) - ] - ) - - yield from self.gen_dataset_workunits( - table=table, - columns=columns, - project_id=project_id, - dataset_name=dataset_name, - sub_types=sub_types, - tags_to_add=tags_to_add, - custom_properties=custom_properties, - ) - - def gen_view_dataset_workunits( - self, - table: BigqueryView, - columns: List[BigqueryColumn], - project_id: str, - dataset_name: str, - ) -> Iterable[MetadataWorkUnit]: - tags_to_add = None - if table.labels and 
self.config.capture_view_label_as_tag: - tags_to_add = [ - make_tag_urn(f"{k}:{v}") - for k, v in table.labels.items() - if is_tag_allowed(self.config.capture_view_label_as_tag, k) - ] - yield from self.gen_dataset_workunits( - table=table, - columns=columns, - project_id=project_id, - dataset_name=dataset_name, - tags_to_add=tags_to_add, - sub_types=[DatasetSubTypes.VIEW], - ) - - view = cast(BigqueryView, table) - view_definition_string = view.view_definition - view_properties_aspect = ViewProperties( - materialized=view.materialized, - viewLanguage="SQL", - viewLogic=view_definition_string or "", - ) - yield MetadataChangeProposalWrapper( - entityUrn=self.gen_dataset_urn( - project_id=project_id, dataset_name=dataset_name, table=table.name - ), - aspect=view_properties_aspect, - ).as_workunit() - - def gen_snapshot_dataset_workunits( - self, - table: BigqueryTableSnapshot, - columns: List[BigqueryColumn], - project_id: str, - dataset_name: str, - ) -> Iterable[MetadataWorkUnit]: - custom_properties: Dict[str, str] = {} - if table.ddl: - custom_properties["snapshot_ddl"] = table.ddl - if table.snapshot_time: - custom_properties["snapshot_time"] = str(table.snapshot_time) - if table.size_in_bytes: - custom_properties["size_in_bytes"] = str(table.size_in_bytes) - if table.rows_count: - custom_properties["rows_count"] = str(table.rows_count) - yield from self.gen_dataset_workunits( - table=table, - columns=columns, - project_id=project_id, - dataset_name=dataset_name, - sub_types=[DatasetSubTypes.BIGQUERY_TABLE_SNAPSHOT], - custom_properties=custom_properties, - ) - - def gen_dataset_workunits( - self, - table: Union[BigqueryTable, BigqueryView, BigqueryTableSnapshot], - columns: List[BigqueryColumn], - project_id: str, - dataset_name: str, - sub_types: List[str], - tags_to_add: Optional[List[str]] = None, - custom_properties: Optional[Dict[str, str]] = None, - ) -> Iterable[MetadataWorkUnit]: - dataset_urn = self.gen_dataset_urn( - project_id=project_id, 
dataset_name=dataset_name, table=table.name - ) - - status = Status(removed=False) - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=status - ).as_workunit() - - datahub_dataset_name = BigqueryTableIdentifier( - project_id, dataset_name, table.name - ) - - yield self.gen_schema_metadata( - dataset_urn, table, columns, datahub_dataset_name - ) - - dataset_properties = DatasetProperties( - name=datahub_dataset_name.get_table_display_name(), - description=( - unquote_and_decode_unicode_escape_seq(table.comment) - if table.comment - else "" - ), - qualifiedName=str(datahub_dataset_name), - created=( - TimeStamp(time=int(table.created.timestamp() * 1000)) - if table.created is not None - else None - ), - lastModified=( - TimeStamp(time=int(table.last_altered.timestamp() * 1000)) - if table.last_altered is not None - else None - ), - externalUrl=( - BQ_EXTERNAL_TABLE_URL_TEMPLATE.format( - project=project_id, dataset=dataset_name, table=table.name - ) - if self.config.include_external_url - else None - ), - ) - if custom_properties: - dataset_properties.customProperties.update(custom_properties) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=dataset_properties - ).as_workunit() - - if tags_to_add: - yield self.gen_tags_aspect_workunit(dataset_urn, tags_to_add) - - yield from add_table_to_schema_container( - dataset_urn=dataset_urn, - parent_container_key=self.gen_dataset_key(project_id, dataset_name), - ) - yield self.get_dataplatform_instance_aspect( - dataset_urn=dataset_urn, project_id=project_id - ) - - subTypes = SubTypes(typeNames=sub_types) - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=subTypes - ).as_workunit() - - if self.domain_registry: - yield from get_domain_wu( - dataset_name=str(datahub_dataset_name), - entity_urn=dataset_urn, - domain_registry=self.domain_registry, - domain_config=self.config.domain, - ) - - def gen_tags_aspect_workunit( - self, dataset_urn: str, tags_to_add: 
List[str] - ) -> MetadataWorkUnit: - tags = GlobalTagsClass( - tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add] - ) - return MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=tags - ).as_workunit() - def gen_dataset_urn( self, project_id: str, dataset_name: str, table: str, use_raw_name: bool = False ) -> str: @@ -1235,241 +315,9 @@ def gen_dataset_urn_from_raw_ref(self, ref: BigQueryTableRef) -> str: use_raw_name=True, ) - def gen_dataset_urn_from_ref(self, ref: BigQueryTableRef) -> str: - return self.gen_dataset_urn( - ref.table_identifier.project_id, - ref.table_identifier.dataset, - ref.table_identifier.table, - ) - - def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: - schema_fields: List[SchemaField] = [] - - # Below line affects HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR in global scope - # TODO: Refractor this such that - # converter = HiveColumnToAvroConverter(struct_type_separator=" "); - # converter.get_schema_fields_for_hive_column(...) - original_struct_type_separator = ( - HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR - ) - HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR = " " - _COMPLEX_TYPE = re.compile("^(struct|array)") - last_id = -1 - for col in columns: - # if col.data_type is empty that means this column is part of a complex type - if col.data_type is None or _COMPLEX_TYPE.match(col.data_type.lower()): - # If the we have seen the ordinal position that most probably means we already processed this complex type - if last_id != col.ordinal_position: - schema_fields.extend( - get_schema_fields_for_hive_column( - col.name, col.data_type.lower(), description=col.comment - ) - ) - - # We have to add complex type comments to the correct level - if col.comment: - for idx, field in enumerate(schema_fields): - # Remove all the [version=2.0].[type=struct]. 
tags to get the field path - if ( - re.sub( - r"\[.*?\]\.", - "", - field.fieldPath.lower(), - 0, - re.MULTILINE, - ) - == col.field_path.lower() - ): - field.description = col.comment - schema_fields[idx] = field - break - else: - tags = [] - if col.is_partition_column: - tags.append( - TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY)) - ) - - if col.cluster_column_position is not None: - tags.append( - TagAssociationClass( - make_tag_urn( - f"{CLUSTERING_COLUMN_TAG}_{col.cluster_column_position}" - ) - ) - ) - - if col.policy_tags: - for policy_tag in col.policy_tags: - tags.append(TagAssociationClass(make_tag_urn(policy_tag))) - field = SchemaField( - fieldPath=col.name, - type=SchemaFieldDataType( - self.BIGQUERY_FIELD_TYPE_MAPPINGS.get(col.data_type, NullType)() - ), - nativeDataType=col.data_type, - description=col.comment, - nullable=col.is_nullable, - globalTags=GlobalTagsClass(tags=tags), - ) - schema_fields.append(field) - last_id = col.ordinal_position - HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR = ( - original_struct_type_separator - ) - return schema_fields - - def gen_schema_metadata( - self, - dataset_urn: str, - table: Union[BigqueryTable, BigqueryView, BigqueryTableSnapshot], - columns: List[BigqueryColumn], - dataset_name: BigqueryTableIdentifier, - ) -> MetadataWorkUnit: - schema_metadata = SchemaMetadata( - schemaName=str(dataset_name), - platform=make_data_platform_urn(self.platform), - version=0, - hash="", - platformSchema=MySqlDDL(tableSchema=""), - # fields=[], - fields=self.gen_schema_fields(columns), - ) - - if self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser: - self.sql_parser_schema_resolver.add_schema_metadata( - dataset_urn, schema_metadata - ) - - return MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=schema_metadata - ).as_workunit() - def get_report(self) -> BigQueryV2Report: return self.report - def get_tables_for_dataset( - self, - project_id: str, - dataset_name: str, - ) 
-> Iterable[BigqueryTable]: - # In bigquery there is no way to query all tables in a Project id - with PerfTimer() as timer: - # Partitions view throw exception if we try to query partition info for too many tables - # so we have to limit the number of tables we query partition info. - # The conn.list_tables returns table infos that information_schema doesn't contain and this - # way we can merge that info with the queried one. - # https://cloud.google.com/bigquery/docs/information-schema-partitions - max_batch_size: int = ( - self.config.number_of_datasets_process_in_batch - if not self.config.is_profiling_enabled() - else self.config.number_of_datasets_process_in_batch_if_profiling_enabled - ) - - # We get the list of tables in the dataset to get core table properties and to be able to process the tables in batches - # We collect only the latest shards from sharded tables (tables with _YYYYMMDD suffix) and ignore temporary tables - table_items = self.get_core_table_details( - dataset_name, project_id, self.config.temp_table_dataset_prefix - ) - - items_to_get: Dict[str, TableListItem] = {} - for table_item in table_items.keys(): - items_to_get[table_item] = table_items[table_item] - if len(items_to_get) % max_batch_size == 0: - yield from self.bigquery_data_dictionary.get_tables_for_dataset( - project_id, - dataset_name, - items_to_get, - with_data_read_permission=self.config.have_table_data_read_permission, - ) - items_to_get.clear() - - if items_to_get: - yield from self.bigquery_data_dictionary.get_tables_for_dataset( - project_id, - dataset_name, - items_to_get, - with_data_read_permission=self.config.have_table_data_read_permission, - ) - - self.report.metadata_extraction_sec[f"{project_id}.{dataset_name}"] = round( - timer.elapsed_seconds(), 2 - ) - - def get_core_table_details( - self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str - ) -> Dict[str, TableListItem]: - table_items: Dict[str, TableListItem] = {} - # Dict to store sharded 
table and the last seen max shard id - sharded_tables: Dict[str, TableListItem] = {} - - for table in self.bigquery_data_dictionary.list_tables( - dataset_name, project_id - ): - table_identifier = BigqueryTableIdentifier( - project_id=project_id, - dataset=dataset_name, - table=table.table_id, - ) - - if table.table_type == "VIEW": - if ( - not self.config.include_views - or not self.config.view_pattern.allowed( - table_identifier.raw_table_name() - ) - ): - self.report.report_dropped(table_identifier.raw_table_name()) - continue - else: - if not self.config.table_pattern.allowed( - table_identifier.raw_table_name() - ): - self.report.report_dropped(table_identifier.raw_table_name()) - continue - - _, shard = BigqueryTableIdentifier.get_table_and_shard( - table_identifier.table - ) - table_name = table_identifier.get_table_name().split(".")[-1] - - # Sharded tables look like: table_20220120 - # For sharded tables we only process the latest shard and ignore the rest - # to find the latest shard we iterate over the list of tables and store the maximum shard id - # We only have one special case where the table name is a date `20220110` - # in this case we merge all these tables under dataset name as table name. - # For example some_dataset.20220110 will be turned to some_dataset.some_dataset - # It seems like there are some bigquery user who uses this non-standard way of sharding the tables. 
- if shard: - if table_name not in sharded_tables: - sharded_tables[table_name] = table - continue - - stored_table_identifier = BigqueryTableIdentifier( - project_id=project_id, - dataset=dataset_name, - table=sharded_tables[table_name].table_id, - ) - _, stored_shard = BigqueryTableIdentifier.get_table_and_shard( - stored_table_identifier.table - ) - # When table is none, we use dataset_name as table_name - assert stored_shard - if stored_shard < shard: - sharded_tables[table_name] = table - continue - elif str(table_identifier).startswith(temp_table_dataset_prefix): - logger.debug(f"Dropping temporary table {table_identifier.table}") - self.report.report_dropped(table_identifier.raw_table_name()) - continue - - table_items[table.table_id] = table - - # Adding maximum shards to the list of tables - table_items.update({value.table_id: value for value in sharded_tables.values()}) - - return table_items - def add_config_to_report(self): self.report.include_table_lineage = self.config.include_table_lineage self.report.use_date_sharded_audit_log_tables = ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index 75e116773df966..7d2f8ee0e1fd8d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -66,6 +66,7 @@ def get_exported_bigquery_audit_metadata( rate_limiter = RateLimiter(max_calls=self.requests_per_min, period=60) with self.report.get_exported_log_entries as current_timer: + self.report.num_get_exported_log_entries_api_requests += 1 for dataset in bigquery_audit_metadata_datasets: logger.info( f"Start loading log entries from BigQueryAuditMetadata in {dataset}" @@ -115,6 +116,7 @@ def get_bigquery_log_entries_via_gcp_logging( ) with self.report.list_log_entries as current_timer: + 
self.report.num_list_log_entries_api_requests += 1 list_entries = client.list_entries( filter_=filter, page_size=log_page_size, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 578c9dddbd2e46..fe961dbd780f6f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -24,6 +24,10 @@ logger = logging.getLogger(__name__) +DEFAULT_BQ_SCHEMA_PARALLELISM = int( + os.getenv("DATAHUB_BIGQUERY_SCHEMA_PARALLELISM", 20) +) + class BigQueryUsageConfig(BaseUsageConfig): _query_log_delay_removed = pydantic_removed_field("query_log_delay") @@ -175,12 +179,12 @@ class BigQueryV2Config( number_of_datasets_process_in_batch: int = Field( hidden_from_docs=True, - default=500, + default=10000, description="Number of table queried in batch when getting metadata. This is a low level config property which should be touched with care.", ) number_of_datasets_process_in_batch_if_profiling_enabled: int = Field( - default=200, + default=1000, description="Number of partitioned table queried in batch when getting metadata. This is a low level config property which should be touched with care. This restriction is needed because we query partitions system view which throws error if we try to touch too many tables.", ) @@ -313,6 +317,12 @@ def have_table_data_read_permission(self) -> bool: hidden_from_schema=True, ) + max_threads_dataset_parallelism: int = Field( + default=DEFAULT_BQ_SCHEMA_PARALLELISM, + description="Number of worker threads to use to parallelize BigQuery Dataset Metadata Extraction." + " Set to 1 to disable.", + ) + @root_validator(skip_on_failure=True) def profile_default_settings(cls, values: Dict) -> Dict: # Extra default SQLAlchemy option for better connection pooling and threading. 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 8a1bf9e5f3d1d6..4cfcc3922ddc3d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -20,20 +20,32 @@ @dataclass class BigQuerySchemaApiPerfReport(Report): - num_list_projects: int = 0 + num_listed_projects: int = 0 num_list_projects_retry_request: int = 0 + num_list_projects_api_requests: int = 0 + num_list_datasets_api_requests: int = 0 + num_get_columns_for_dataset_api_requests: int = 0 + num_get_tables_for_dataset_api_requests: int = 0 + num_list_tables_api_requests: int = 0 + num_get_views_for_dataset_api_requests: int = 0 + num_get_snapshots_for_dataset_api_requests: int = 0 + list_projects: PerfTimer = field(default_factory=PerfTimer) list_datasets: PerfTimer = field(default_factory=PerfTimer) - get_columns_for_dataset: PerfTimer = field(default_factory=PerfTimer) - get_tables_for_dataset: PerfTimer = field(default_factory=PerfTimer) - list_tables: PerfTimer = field(default_factory=PerfTimer) - get_views_for_dataset: PerfTimer = field(default_factory=PerfTimer) - get_snapshots_for_dataset: PerfTimer = field(default_factory=PerfTimer) + + get_columns_for_dataset_sec: float = 0 + get_tables_for_dataset_sec: float = 0 + list_tables_sec: float = 0 + get_views_for_dataset_sec: float = 0 + get_snapshots_for_dataset_sec: float = 0 @dataclass class BigQueryAuditLogApiPerfReport(Report): + num_get_exported_log_entries_api_requests: int = 0 get_exported_log_entries: PerfTimer = field(default_factory=PerfTimer) + + num_list_log_entries_api_requests: int = 0 list_log_entries: PerfTimer = field(default_factory=PerfTimer) @@ -85,7 +97,6 @@ class BigQueryV2Report( num_usage_parsed_log_entries: TopKDict[str, int] = field( default_factory=int_top_k_dict ) - usage_error_count: 
Dict[str, int] = field(default_factory=int_top_k_dict) num_usage_resources_dropped: int = 0 num_usage_operations_dropped: int = 0 diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py index fe9bbc134a147b..7bb9becfc9a0d0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema.py @@ -24,6 +24,7 @@ BigqueryTableType, ) from datahub.ingestion.source.sql.sql_generic import BaseColumn, BaseTable, BaseView +from datahub.utilities.perf_timer import PerfTimer from datahub.utilities.ratelimiter import RateLimiter logger: logging.Logger = logging.getLogger(__name__) @@ -163,33 +164,31 @@ def _should_retry(exc: BaseException) -> bool: return True with self.report.list_projects: - try: - # Bigquery API has limit in calling project.list request i.e. 2 request per second. - # https://cloud.google.com/bigquery/quotas#api_request_quotas - # Whenever this limit reached an exception occur with msg - # 'Quota exceeded: Your user exceeded quota for concurrent project.lists requests.' - # Hence, added the api request retry of 15 min. - # We already tried adding rate_limit externally, proving max_result and page_size - # to restrict the request calls inside list_project but issue still occured. - projects_iterator = self.bq_client.list_projects( - retry=retry.Retry( - predicate=_should_retry, initial=10, maximum=180, timeout=900 - ) + self.report.num_list_projects_api_requests += 1 + # Bigquery API has limit in calling project.list request i.e. 2 request per second. + # https://cloud.google.com/bigquery/quotas#api_request_quotas + # Whenever this limit reached an exception occur with msg + # 'Quota exceeded: Your user exceeded quota for concurrent project.lists requests.' + # Hence, added the api request retry of 15 min. 
+ # We already tried adding rate_limit externally, proving max_result and page_size + # to restrict the request calls inside list_project but issue still occured. + projects_iterator = self.bq_client.list_projects( + retry=retry.Retry( + predicate=_should_retry, initial=10, maximum=180, timeout=900 ) - projects: List[BigqueryProject] = [ - BigqueryProject(id=p.project_id, name=p.friendly_name) - for p in projects_iterator - ] - self.report.num_list_projects = len(projects) - return projects - except Exception as e: - logger.error(f"Error getting projects. {e}", exc_info=True) - return [] + ) + projects: List[BigqueryProject] = [ + BigqueryProject(id=p.project_id, name=p.friendly_name) + for p in projects_iterator + ] + self.report.num_listed_projects = len(projects) + return projects def get_datasets_for_project_id( self, project_id: str, maxResults: Optional[int] = None ) -> List[BigqueryDataset]: with self.report.list_datasets: + self.report.num_list_datasets_api_requests += 1 datasets = self.bq_client.list_datasets(project_id, max_results=maxResults) return [ BigqueryDataset(name=d.dataset_id, labels=d.labels) for d in datasets @@ -222,50 +221,42 @@ def get_datasets_for_project_id_with_information_schema( def list_tables( self, dataset_name: str, project_id: str ) -> Iterator[TableListItem]: - with self.report.list_tables as current_timer: + with PerfTimer() as current_timer: for table in self.bq_client.list_tables(f"{project_id}.{dataset_name}"): with current_timer.pause(): yield table + self.report.num_list_tables_api_requests += 1 + self.report.list_tables_sec += current_timer.elapsed_seconds() def get_tables_for_dataset( self, project_id: str, dataset_name: str, tables: Dict[str, TableListItem], + report: BigQueryV2Report, with_data_read_permission: bool = False, - report: Optional[BigQueryV2Report] = None, ) -> Iterator[BigqueryTable]: - with self.report.get_tables_for_dataset as current_timer: + with PerfTimer() as current_timer: filter_clause: str = ", 
".join(f"'{table}'" for table in tables.keys()) if with_data_read_permission: - # Tables are ordered by name and table suffix to make sure we always process the latest sharded table - # and skip the others. Sharded tables are tables with suffix _20220102 - cur = self.get_query_result( - BigqueryQuery.tables_for_dataset.format( - project_id=project_id, - dataset_name=dataset_name, - table_filter=( - f" and t.table_name in ({filter_clause})" - if filter_clause - else "" - ), - ), - ) + query_template = BigqueryQuery.tables_for_dataset else: - # Tables are ordered by name and table suffix to make sure we always process the latest sharded table - # and skip the others. Sharded tables are tables with suffix _20220102 - cur = self.get_query_result( - BigqueryQuery.tables_for_dataset_without_partition_data.format( - project_id=project_id, - dataset_name=dataset_name, - table_filter=( - f" and t.table_name in ({filter_clause})" - if filter_clause - else "" - ), + query_template = BigqueryQuery.tables_for_dataset_without_partition_data + + # Tables are ordered by name and table suffix to make sure we always process the latest sharded table + # and skip the others. 
Sharded tables are tables with suffix _20220102 + cur = self.get_query_result( + query_template.format( + project_id=project_id, + dataset_name=dataset_name, + table_filter=( + f" and t.table_name in ({filter_clause})" + if filter_clause + else "" ), - ) + ), + ) for table in cur: try: @@ -275,15 +266,14 @@ def get_tables_for_dataset( ) except Exception as e: table_name = f"{project_id}.{dataset_name}.{table.table_name}" - logger.warning( - f"Error while processing table {table_name}", - exc_info=True, + report.warning( + title="Failed to process table", + message="Error encountered while processing table", + context=table_name, + exc=e, ) - if report: - report.report_warning( - "metadata-extraction", - f"Failed to get table {table_name}: {e}", - ) + self.report.num_get_tables_for_dataset_api_requests += 1 + self.report.get_tables_for_dataset_sec += current_timer.elapsed_seconds() @staticmethod def _make_bigquery_table( @@ -332,7 +322,7 @@ def get_views_for_dataset( has_data_read: bool, report: BigQueryV2Report, ) -> Iterator[BigqueryView]: - with self.report.get_views_for_dataset as current_timer: + with PerfTimer() as current_timer: if has_data_read: # If profiling is enabled cur = self.get_query_result( @@ -353,14 +343,14 @@ def get_views_for_dataset( yield BigQuerySchemaApi._make_bigquery_view(table) except Exception as e: view_name = f"{project_id}.{dataset_name}.{table.table_name}" - logger.warning( - f"Error while processing view {view_name}", - exc_info=True, - ) - report.report_warning( - "metadata-extraction", - f"Failed to get view {view_name}: {e}", + report.warning( + title="Failed to process view", + message="Error encountered while processing view", + context=view_name, + exc=e, ) + self.report.num_get_views_for_dataset_api_requests += 1 + self.report.get_views_for_dataset_sec += current_timer.elapsed_seconds() @staticmethod def _make_bigquery_view(view: bigquery.Row) -> BigqueryView: @@ -416,22 +406,18 @@ def get_policy_tags_for_column( ) yield 
policy_tag.display_name except Exception as e: - logger.warning( - f"Unexpected error when retrieving policy tag {policy_tag_name} for column {column_name} in table {table_name}: {e}", - exc_info=True, - ) - report.report_warning( - "metadata-extraction", - f"Failed to retrieve policy tag {policy_tag_name} for column {column_name} in table {table_name} due to unexpected error: {e}", + report.warning( + title="Failed to retrieve policy tag", + message="Unexpected error when retrieving policy tag for column", + context=f"policy tag {policy_tag_name} for column {column_name} in table {table_ref}", + exc=e, ) except Exception as e: - logger.error( - f"Unexpected error retrieving schema for table {table_name} in dataset {dataset_name}, project {project_id}: {e}", - exc_info=True, - ) - report.report_warning( - "metadata-extraction", - f"Failed to retrieve schema for table {table_name} in dataset {dataset_name}, project {project_id} due to unexpected error: {e}", + report.warning( + title="Failed to retrieve policy tag for table", + message="Unexpected error retrieving policy tag for table", + context=table_ref, + exc=e, ) def get_columns_for_dataset( @@ -445,7 +431,7 @@ def get_columns_for_dataset( rate_limiter: Optional[RateLimiter] = None, ) -> Optional[Dict[str, List[BigqueryColumn]]]: columns: Dict[str, List[BigqueryColumn]] = defaultdict(list) - with self.report.get_columns_for_dataset: + with PerfTimer() as timer: try: cur = self.get_query_result( ( @@ -461,89 +447,57 @@ def get_columns_for_dataset( ), ) except Exception as e: - logger.warning(f"Columns for dataset query failed with exception: {e}") - # Error - Information schema query returned too much data. - # Please repeat query with more selective predicates. 
+ report.warning( + title="Failed to retrieve columns for dataset", + message="Query to get columns for dataset failed with exception", + context=f"{project_id}.{dataset_name}", + exc=e, + ) return None last_seen_table: str = "" for column in cur: - if ( - column_limit - and column.table_name in columns - and len(columns[column.table_name]) >= column_limit - ): - if last_seen_table != column.table_name: - logger.warning( - f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns" - ) - last_seen_table = column.table_name - else: - columns[column.table_name].append( - BigqueryColumn( - name=column.column_name, - ordinal_position=column.ordinal_position, - field_path=column.field_path, - is_nullable=column.is_nullable == "YES", - data_type=column.data_type, - comment=column.comment, - is_partition_column=column.is_partitioning_column == "YES", - cluster_column_position=column.clustering_ordinal_position, - policy_tags=( - list( - self.get_policy_tags_for_column( - project_id, - dataset_name, - column.table_name, - column.column_name, - report, - rate_limiter, + with timer.pause(): + if ( + column_limit + and column.table_name in columns + and len(columns[column.table_name]) >= column_limit + ): + if last_seen_table != column.table_name: + logger.warning( + f"{project_id}.{dataset_name}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns" + ) + last_seen_table = column.table_name + else: + columns[column.table_name].append( + BigqueryColumn( + name=column.column_name, + ordinal_position=column.ordinal_position, + field_path=column.field_path, + is_nullable=column.is_nullable == "YES", + data_type=column.data_type, + comment=column.comment, + is_partition_column=column.is_partitioning_column + == "YES", + cluster_column_position=column.clustering_ordinal_position, + policy_tags=( + list( + self.get_policy_tags_for_column( + project_id, + 
dataset_name, + column.table_name, + column.column_name, + report, + rate_limiter, + ) ) - ) - if extract_policy_tags_from_catalog - else [] - ), + if extract_policy_tags_from_catalog + else [] + ), + ) ) - ) - - return columns - - # This is not used anywhere - def get_columns_for_table( - self, - table_identifier: BigqueryTableIdentifier, - column_limit: Optional[int], - ) -> List[BigqueryColumn]: - cur = self.get_query_result( - BigqueryQuery.columns_for_table.format(table_identifier=table_identifier), - ) - - columns: List[BigqueryColumn] = [] - last_seen_table: str = "" - for column in cur: - if ( - column_limit - and column.table_name in columns - and len(columns[column.table_name]) >= column_limit - ): - if last_seen_table != column.table_name: - logger.warning( - f"{table_identifier.project_id}.{table_identifier.dataset}.{column.table_name} contains more than {column_limit} columns, only processing {column_limit} columns" - ) - else: - columns.append( - BigqueryColumn( - name=column.column_name, - ordinal_position=column.ordinal_position, - is_nullable=column.is_nullable == "YES", - field_path=column.field_path, - data_type=column.data_type, - comment=column.comment, - is_partition_column=column.is_partitioning_column == "YES", - cluster_column_position=column.clustering_ordinal_position, - ) - ) - last_seen_table = column.table_name + self.report.num_get_columns_for_dataset_api_requests += 1 + self.report.get_columns_for_dataset_sec += timer.elapsed_seconds() return columns @@ -554,7 +508,7 @@ def get_snapshots_for_dataset( has_data_read: bool, report: BigQueryV2Report, ) -> Iterator[BigqueryTableSnapshot]: - with self.report.get_snapshots_for_dataset as current_timer: + with PerfTimer() as current_timer: if has_data_read: # If profiling is enabled cur = self.get_query_result( @@ -575,14 +529,14 @@ def get_snapshots_for_dataset( yield BigQuerySchemaApi._make_bigquery_table_snapshot(table) except Exception as e: snapshot_name = 
f"{project_id}.{dataset_name}.{table.table_name}" - logger.warning( - f"Error while processing view {snapshot_name}", - exc_info=True, - ) report.report_warning( - "metadata-extraction", - f"Failed to get view {snapshot_name}: {e}", + title="Failed to process snapshot", + message="Error encountered while processing snapshot", + context=snapshot_name, + exc=e, ) + self.report.num_get_snapshots_for_dataset_api_requests += 1 + self.report.get_snapshots_for_dataset_sec += current_timer.elapsed_seconds() @staticmethod def _make_bigquery_table_snapshot(snapshot: bigquery.Row) -> BigqueryTableSnapshot: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py new file mode 100644 index 00000000000000..3ffcb225db1c24 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -0,0 +1,1090 @@ +import logging +import re +from collections import defaultdict +from typing import Callable, Dict, Iterable, List, Optional, Set, Type, Union, cast + +from google.cloud.bigquery.table import TableListItem + +from datahub.configuration.pattern_utils import is_schema_allowed, is_tag_allowed +from datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataplatform_instance_urn, + make_tag_urn, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.glossary.classification_mixin import ( + SAMPLE_SIZE_MULTIPLIER, + ClassificationHandler, + classification_workunit_processor, +) +from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( + BigqueryTableIdentifier, + BigQueryTableRef, +) +from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from 
datahub.ingestion.source.bigquery_v2.bigquery_data_reader import BigQueryDataReader +from datahub.ingestion.source.bigquery_v2.bigquery_helper import ( + unquote_and_decode_unicode_escape_seq, +) +from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report +from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( + BigqueryColumn, + BigqueryDataset, + BigqueryProject, + BigQuerySchemaApi, + BigqueryTable, + BigqueryTableSnapshot, + BigqueryView, +) +from datahub.ingestion.source.bigquery_v2.common import ( + BQ_EXTERNAL_DATASET_URL_TEMPLATE, + BQ_EXTERNAL_TABLE_URL_TEMPLATE, +) +from datahub.ingestion.source.bigquery_v2.profiler import BigqueryProfiler +from datahub.ingestion.source.common.subtypes import ( + DatasetContainerSubTypes, + DatasetSubTypes, +) +from datahub.ingestion.source.sql.sql_utils import ( + add_table_to_schema_container, + gen_database_container, + gen_schema_container, + get_domain_wu, +) +from datahub.ingestion.source_report.ingestion_stage import ( + METADATA_EXTRACTION, + PROFILING, +) +from datahub.metadata.com.linkedin.pegasus2avro.common import ( + Status, + SubTypes, + TimeStamp, +) +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + DatasetProperties, + ViewProperties, +) +from datahub.metadata.com.linkedin.pegasus2avro.schema import ( + ArrayType, + BooleanType, + BytesType, + DateType, + MySqlDDL, + NullType, + NumberType, + RecordType, + SchemaField, + SchemaFieldDataType, + SchemaMetadata, + StringType, + TimeType, +) +from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, + GlobalTagsClass, + TagAssociationClass, +) +from datahub.sql_parsing.schema_resolver import SchemaResolver +from datahub.utilities.file_backed_collections import FileBackedDict +from datahub.utilities.hive_schema_to_avro import ( + HiveColumnToAvroConverter, + get_schema_fields_for_hive_column, +) +from datahub.utilities.mapping import Constants +from datahub.utilities.perf_timer import 
PerfTimer +from datahub.utilities.ratelimiter import RateLimiter +from datahub.utilities.registries.domain_registry import DomainRegistry +from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor + +logger: logging.Logger = logging.getLogger(__name__) +# Handle table snapshots +# See https://cloud.google.com/bigquery/docs/table-snapshots-intro. +SNAPSHOT_TABLE_REGEX = re.compile(r"^(.+)@(\d{13})$") +CLUSTERING_COLUMN_TAG = "CLUSTERING_COLUMN" + + +class BigQuerySchemaGenerator: + # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types + # Note: We use the hive schema parser to parse nested BigQuery types. We also have + # some extra type mappings in that file. + BIGQUERY_FIELD_TYPE_MAPPINGS: Dict[ + str, + Type[ + Union[ + ArrayType, + BytesType, + BooleanType, + NumberType, + RecordType, + StringType, + TimeType, + DateType, + NullType, + ] + ], + ] = { + "BYTES": BytesType, + "BOOL": BooleanType, + "INT": NumberType, + "INT64": NumberType, + "SMALLINT": NumberType, + "INTEGER": NumberType, + "BIGINT": NumberType, + "TINYINT": NumberType, + "BYTEINT": NumberType, + "STRING": StringType, + "TIME": TimeType, + "TIMESTAMP": TimeType, + "DATE": DateType, + "DATETIME": TimeType, + "GEOGRAPHY": NullType, + "JSON": RecordType, + "INTERVAL": NullType, + "ARRAY": ArrayType, + "STRUCT": RecordType, + } + + def __init__( + self, + config: BigQueryV2Config, + report: BigQueryV2Report, + bigquery_data_dictionary: BigQuerySchemaApi, + domain_registry: Optional[DomainRegistry], + sql_parser_schema_resolver: SchemaResolver, + profiler: BigqueryProfiler, + dataset_urn_builder: Callable[[str, str, str], str], + ): + self.config = config + self.report = report + self.bigquery_data_dictionary = bigquery_data_dictionary + self.domain_registry = domain_registry + self.sql_parser_schema_resolver = sql_parser_schema_resolver + self.profiler = profiler + self.gen_dataset_urn = dataset_urn_builder + self.platform: str = "bigquery" + + 
self.classification_handler = ClassificationHandler(self.config, self.report) + self.data_reader: Optional[BigQueryDataReader] = None + if self.classification_handler.is_classification_enabled(): + self.data_reader = BigQueryDataReader.create( + self.config.get_bigquery_client() + ) + + # Global store of table identifiers for lineage filtering + self.table_refs: Set[str] = set() + + # Maps project -> view_ref, so we can find all views in a project + self.view_refs_by_project: Dict[str, Set[str]] = defaultdict(set) + # Maps project -> snapshot_ref, so we can find all snapshots in a project + self.snapshot_refs_by_project: Dict[str, Set[str]] = defaultdict(set) + # Maps view ref -> actual sql + self.view_definitions: FileBackedDict[str] = FileBackedDict() + # Maps snapshot ref -> Snapshot + self.snapshots_by_ref: FileBackedDict[BigqueryTableSnapshot] = FileBackedDict() + + @property + def store_table_refs(self): + return self.config.include_table_lineage or self.config.include_usage_statistics + + def get_project_workunits( + self, project: BigqueryProject + ) -> Iterable[MetadataWorkUnit]: + self.report.set_ingestion_stage(project.id, METADATA_EXTRACTION) + logger.info(f"Processing project: {project.id}") + yield from self._process_project(project) + + def get_dataplatform_instance_aspect( + self, dataset_urn: str, project_id: str + ) -> MetadataWorkUnit: + aspect = DataPlatformInstanceClass( + platform=make_data_platform_urn(self.platform), + instance=( + make_dataplatform_instance_urn(self.platform, project_id) + if self.config.include_data_platform_instance + else None + ), + ) + return MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=aspect + ).as_workunit() + + def gen_dataset_key(self, db_name: str, schema: str) -> ContainerKey: + return BigQueryDatasetKey( + project_id=db_name, + dataset_id=schema, + platform=self.platform, + env=self.config.env, + backcompat_env_as_instance=True, + ) + + def gen_project_id_key(self, database: str) -> 
ContainerKey: + return ProjectIdKey( + project_id=database, + platform=self.platform, + env=self.config.env, + backcompat_env_as_instance=True, + ) + + def gen_project_id_containers(self, database: str) -> Iterable[MetadataWorkUnit]: + database_container_key = self.gen_project_id_key(database) + + yield from gen_database_container( + database=database, + name=database, + sub_types=[DatasetContainerSubTypes.BIGQUERY_PROJECT], + domain_registry=self.domain_registry, + domain_config=self.config.domain, + database_container_key=database_container_key, + ) + + def gen_dataset_containers( + self, dataset: str, project_id: str, tags: Optional[Dict[str, str]] = None + ) -> Iterable[MetadataWorkUnit]: + schema_container_key = self.gen_dataset_key(project_id, dataset) + + tags_joined: Optional[List[str]] = None + if tags and self.config.capture_dataset_label_as_tag: + tags_joined = [ + f"{k}:{v}" + for k, v in tags.items() + if is_tag_allowed(self.config.capture_dataset_label_as_tag, k) + ] + + database_container_key = self.gen_project_id_key(database=project_id) + + yield from gen_schema_container( + database=project_id, + schema=dataset, + sub_types=[DatasetContainerSubTypes.BIGQUERY_DATASET], + domain_registry=self.domain_registry, + domain_config=self.config.domain, + schema_container_key=schema_container_key, + database_container_key=database_container_key, + external_url=( + BQ_EXTERNAL_DATASET_URL_TEMPLATE.format( + project=project_id, dataset=dataset + ) + if self.config.include_external_url + else None + ), + tags=tags_joined, + ) + + def _process_project( + self, bigquery_project: BigqueryProject + ) -> Iterable[MetadataWorkUnit]: + db_tables: Dict[str, List[BigqueryTable]] = {} + + project_id = bigquery_project.id + try: + bigquery_project.datasets = ( + self.bigquery_data_dictionary.get_datasets_for_project_id(project_id) + ) + except Exception as e: + + if ( + self.config.project_id or self.config.project_ids + ) and "not enabled BigQuery." 
in str(e): + action_mesage = ( + "The project has not enabled BigQuery API. " + "Did you mistype project id in recipe ?" + ) + else: + action_mesage = ( + "Does your service account have `bigquery.datasets.get` permission ? " + "Assign predefined role `roles/bigquery.metadataViewer` to your service account." + ) + + self.report.failure( + title="Unable to get datasets for project", + message=action_mesage, + context=project_id, + exc=e, + ) + return None + + if len(bigquery_project.datasets) == 0: + action_message = ( + "Either there are no datasets in this project or missing `bigquery.datasets.get` permission. " + "You can assign predefined roles/bigquery.metadataViewer role to your service account." + ) + if self.config.exclude_empty_projects: + self.report.report_dropped(project_id) + logger.info( + f"Excluded project '{project_id}' since no datasets were found. {action_message}" + ) + else: + yield from self.gen_project_id_containers(project_id) + self.report.warning( + title="No datasets found in project", + message=action_message, + context=project_id, + ) + return + + yield from self.gen_project_id_containers(project_id) + + self.report.num_project_datasets_to_scan[project_id] = len( + bigquery_project.datasets + ) + yield from self._process_project_datasets(bigquery_project, db_tables) + + if self.config.is_profiling_enabled(): + logger.info(f"Starting profiling project {project_id}") + self.report.set_ingestion_stage(project_id, PROFILING) + yield from self.profiler.get_workunits( + project_id=project_id, + tables=db_tables, + ) + + def _process_project_datasets( + self, + bigquery_project: BigqueryProject, + db_tables: Dict[str, List[BigqueryTable]], + ) -> Iterable[MetadataWorkUnit]: + + db_views: Dict[str, List[BigqueryView]] = {} + db_snapshots: Dict[str, List[BigqueryTableSnapshot]] = {} + project_id = bigquery_project.id + + def _process_schema_worker( + bigquery_dataset: BigqueryDataset, + ) -> Iterable[MetadataWorkUnit]: + if not is_schema_allowed( 
+ self.config.dataset_pattern, + bigquery_dataset.name, + project_id, + self.config.match_fully_qualified_names, + ): + self.report.report_dropped(f"{bigquery_dataset.name}.*") + return + try: + # db_tables, db_views, and db_snapshots are populated in the this method + for wu in self._process_schema( + project_id, bigquery_dataset, db_tables, db_views, db_snapshots + ): + yield wu + except Exception as e: + if self.config.is_profiling_enabled(): + action_mesage = "Does your service account has bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission, bigquery.tables.getData permission?" + else: + action_mesage = "Does your service account has bigquery.tables.list, bigquery.routines.get, bigquery.routines.list permission?" + + self.report.failure( + title="Unable to get tables for dataset", + message=action_mesage, + context=f"{project_id}.{bigquery_dataset.name}", + exc=e, + ) + + for wu in ThreadedIteratorExecutor.process( + worker_func=_process_schema_worker, + args_list=[(bq_dataset,) for bq_dataset in bigquery_project.datasets], + max_workers=self.config.max_threads_dataset_parallelism, + ): + yield wu + + def _process_schema( + self, + project_id: str, + bigquery_dataset: BigqueryDataset, + db_tables: Dict[str, List[BigqueryTable]], + db_views: Dict[str, List[BigqueryView]], + db_snapshots: Dict[str, List[BigqueryTableSnapshot]], + ) -> Iterable[MetadataWorkUnit]: + dataset_name = bigquery_dataset.name + + yield from self.gen_dataset_containers( + dataset_name, project_id, bigquery_dataset.labels + ) + + columns = None + + rate_limiter: Optional[RateLimiter] = None + if self.config.rate_limit: + rate_limiter = RateLimiter( + max_calls=self.config.requests_per_min, period=60 + ) + + if ( + self.config.include_tables + or self.config.include_views + or self.config.include_table_snapshots + ): + columns = self.bigquery_data_dictionary.get_columns_for_dataset( + project_id=project_id, + dataset_name=dataset_name, + 
column_limit=self.config.column_limit, + run_optimized_column_query=self.config.run_optimized_column_query, + extract_policy_tags_from_catalog=self.config.extract_policy_tags_from_catalog, + report=self.report, + rate_limiter=rate_limiter, + ) + + if self.config.include_tables: + db_tables[dataset_name] = list( + self.get_tables_for_dataset(project_id, dataset_name) + ) + + for table in db_tables[dataset_name]: + table_columns = columns.get(table.name, []) if columns else [] + table_wu_generator = self._process_table( + table=table, + columns=table_columns, + project_id=project_id, + dataset_name=dataset_name, + ) + yield from classification_workunit_processor( + table_wu_generator, + self.classification_handler, + self.data_reader, + [project_id, dataset_name, table.name], + data_reader_kwargs=dict( + sample_size_percent=( + self.config.classification.sample_size + * SAMPLE_SIZE_MULTIPLIER + / table.rows_count + if table.rows_count + else None + ) + ), + ) + elif self.store_table_refs: + # Need table_refs to calculate lineage and usage + for table_item in self.bigquery_data_dictionary.list_tables( + dataset_name, project_id + ): + identifier = BigqueryTableIdentifier( + project_id=project_id, + dataset=dataset_name, + table=table_item.table_id, + ) + if not self.config.table_pattern.allowed(identifier.raw_table_name()): + self.report.report_dropped(identifier.raw_table_name()) + continue + try: + self.table_refs.add( + str(BigQueryTableRef(identifier).get_sanitized_table_ref()) + ) + except Exception as e: + logger.warning( + f"Could not create table ref for {table_item.path}: {e}" + ) + + if self.config.include_views: + db_views[dataset_name] = list( + self.bigquery_data_dictionary.get_views_for_dataset( + project_id, + dataset_name, + self.config.is_profiling_enabled(), + self.report, + ) + ) + + for view in db_views[dataset_name]: + view_columns = columns.get(view.name, []) if columns else [] + yield from self._process_view( + view=view, + columns=view_columns, 
+ project_id=project_id, + dataset_name=dataset_name, + ) + + if self.config.include_table_snapshots: + db_snapshots[dataset_name] = list( + self.bigquery_data_dictionary.get_snapshots_for_dataset( + project_id, + dataset_name, + self.config.is_profiling_enabled(), + self.report, + ) + ) + + for snapshot in db_snapshots[dataset_name]: + snapshot_columns = columns.get(snapshot.name, []) if columns else [] + yield from self._process_snapshot( + snapshot=snapshot, + columns=snapshot_columns, + project_id=project_id, + dataset_name=dataset_name, + ) + + # This method is used to generate the ignore list for datatypes the profiler doesn't support we have to do it here + # because the profiler doesn't have access to columns + def generate_profile_ignore_list(self, columns: List[BigqueryColumn]) -> List[str]: + ignore_list: List[str] = [] + for column in columns: + if not column.data_type or any( + word in column.data_type.lower() + for word in ["array", "struct", "geography", "json"] + ): + ignore_list.append(column.field_path) + return ignore_list + + def _process_table( + self, + table: BigqueryTable, + columns: List[BigqueryColumn], + project_id: str, + dataset_name: str, + ) -> Iterable[MetadataWorkUnit]: + table_identifier = BigqueryTableIdentifier(project_id, dataset_name, table.name) + + self.report.report_entity_scanned(table_identifier.raw_table_name()) + + if not self.config.table_pattern.allowed(table_identifier.raw_table_name()): + self.report.report_dropped(table_identifier.raw_table_name()) + return + + if self.store_table_refs: + self.table_refs.add( + str(BigQueryTableRef(table_identifier).get_sanitized_table_ref()) + ) + table.column_count = len(columns) + + # We only collect profile ignore list if profiling is enabled and profile_table_level_only is false + if ( + self.config.is_profiling_enabled() + and not self.config.profiling.profile_table_level_only + ): + table.columns_ignore_from_profiling = self.generate_profile_ignore_list( + columns + ) + + if 
not table.column_count: + logger.warning( + f"Table doesn't have any column or unable to get columns for table: {table_identifier}" + ) + + # If table has time partitioning, set the data type of the partitioning field + if table.partition_info: + table.partition_info.column = next( + ( + column + for column in columns + if column.name == table.partition_info.field + ), + None, + ) + yield from self.gen_table_dataset_workunits( + table, columns, project_id, dataset_name + ) + + def _process_view( + self, + view: BigqueryView, + columns: List[BigqueryColumn], + project_id: str, + dataset_name: str, + ) -> Iterable[MetadataWorkUnit]: + table_identifier = BigqueryTableIdentifier(project_id, dataset_name, view.name) + + self.report.report_entity_scanned(table_identifier.raw_table_name(), "view") + + if not self.config.view_pattern.allowed(table_identifier.raw_table_name()): + self.report.report_dropped(table_identifier.raw_table_name()) + return + + if self.store_table_refs: + table_ref = str( + BigQueryTableRef(table_identifier).get_sanitized_table_ref() + ) + self.table_refs.add(table_ref) + if self.config.lineage_parse_view_ddl and view.view_definition: + self.view_refs_by_project[project_id].add(table_ref) + self.view_definitions[table_ref] = view.view_definition + + view.column_count = len(columns) + if not view.column_count: + logger.warning( + f"View doesn't have any column or unable to get columns for view: {table_identifier}" + ) + + yield from self.gen_view_dataset_workunits( + table=view, + columns=columns, + project_id=project_id, + dataset_name=dataset_name, + ) + + def _process_snapshot( + self, + snapshot: BigqueryTableSnapshot, + columns: List[BigqueryColumn], + project_id: str, + dataset_name: str, + ) -> Iterable[MetadataWorkUnit]: + table_identifier = BigqueryTableIdentifier( + project_id, dataset_name, snapshot.name + ) + + self.report.snapshots_scanned += 1 + + if not self.config.table_snapshot_pattern.allowed( + table_identifier.raw_table_name() + 
): + self.report.report_dropped(table_identifier.raw_table_name()) + return + + snapshot.columns = columns + snapshot.column_count = len(columns) + if not snapshot.column_count: + logger.warning( + f"Snapshot doesn't have any column or unable to get columns for snapshot: {table_identifier}" + ) + + if self.store_table_refs: + table_ref = str( + BigQueryTableRef(table_identifier).get_sanitized_table_ref() + ) + self.table_refs.add(table_ref) + if snapshot.base_table_identifier: + self.snapshot_refs_by_project[project_id].add(table_ref) + self.snapshots_by_ref[table_ref] = snapshot + + yield from self.gen_snapshot_dataset_workunits( + table=snapshot, + columns=columns, + project_id=project_id, + dataset_name=dataset_name, + ) + + def gen_table_dataset_workunits( + self, + table: BigqueryTable, + columns: List[BigqueryColumn], + project_id: str, + dataset_name: str, + ) -> Iterable[MetadataWorkUnit]: + custom_properties: Dict[str, str] = {} + if table.expires: + custom_properties["expiration_date"] = str(table.expires) + + if table.partition_info: + custom_properties["partition_info"] = str(table.partition_info) + + if table.size_in_bytes: + custom_properties["size_in_bytes"] = str(table.size_in_bytes) + + if table.active_billable_bytes: + custom_properties["billable_bytes_active"] = str( + table.active_billable_bytes + ) + + if table.long_term_billable_bytes: + custom_properties["billable_bytes_long_term"] = str( + table.long_term_billable_bytes + ) + + if table.max_partition_id: + custom_properties["number_of_partitions"] = str(table.num_partitions) + custom_properties["max_partition_id"] = str(table.max_partition_id) + custom_properties["is_partitioned"] = str(True) + + sub_types: List[str] = [DatasetSubTypes.TABLE] + if table.max_shard_id: + custom_properties["max_shard_id"] = str(table.max_shard_id) + custom_properties["is_sharded"] = str(True) + sub_types = ["sharded table"] + sub_types + + tags_to_add = None + if table.labels and 
self.config.capture_table_label_as_tag: + tags_to_add = [] + tags_to_add.extend( + [ + make_tag_urn(f"""{k}:{v}""") + for k, v in table.labels.items() + if is_tag_allowed(self.config.capture_table_label_as_tag, k) + ] + ) + + yield from self.gen_dataset_workunits( + table=table, + columns=columns, + project_id=project_id, + dataset_name=dataset_name, + sub_types=sub_types, + tags_to_add=tags_to_add, + custom_properties=custom_properties, + ) + + def gen_view_dataset_workunits( + self, + table: BigqueryView, + columns: List[BigqueryColumn], + project_id: str, + dataset_name: str, + ) -> Iterable[MetadataWorkUnit]: + tags_to_add = None + if table.labels and self.config.capture_view_label_as_tag: + tags_to_add = [ + make_tag_urn(f"{k}:{v}") + for k, v in table.labels.items() + if is_tag_allowed(self.config.capture_view_label_as_tag, k) + ] + yield from self.gen_dataset_workunits( + table=table, + columns=columns, + project_id=project_id, + dataset_name=dataset_name, + tags_to_add=tags_to_add, + sub_types=[DatasetSubTypes.VIEW], + ) + + view = cast(BigqueryView, table) + view_definition_string = view.view_definition + view_properties_aspect = ViewProperties( + materialized=view.materialized, + viewLanguage="SQL", + viewLogic=view_definition_string or "", + ) + yield MetadataChangeProposalWrapper( + entityUrn=self.gen_dataset_urn(project_id, dataset_name, table.name), + aspect=view_properties_aspect, + ).as_workunit() + + def gen_snapshot_dataset_workunits( + self, + table: BigqueryTableSnapshot, + columns: List[BigqueryColumn], + project_id: str, + dataset_name: str, + ) -> Iterable[MetadataWorkUnit]: + custom_properties: Dict[str, str] = {} + if table.ddl: + custom_properties["snapshot_ddl"] = table.ddl + if table.snapshot_time: + custom_properties["snapshot_time"] = str(table.snapshot_time) + if table.size_in_bytes: + custom_properties["size_in_bytes"] = str(table.size_in_bytes) + if table.rows_count: + custom_properties["rows_count"] = str(table.rows_count) + yield 
from self.gen_dataset_workunits( + table=table, + columns=columns, + project_id=project_id, + dataset_name=dataset_name, + sub_types=[DatasetSubTypes.BIGQUERY_TABLE_SNAPSHOT], + custom_properties=custom_properties, + ) + + def gen_dataset_workunits( + self, + table: Union[BigqueryTable, BigqueryView, BigqueryTableSnapshot], + columns: List[BigqueryColumn], + project_id: str, + dataset_name: str, + sub_types: List[str], + tags_to_add: Optional[List[str]] = None, + custom_properties: Optional[Dict[str, str]] = None, + ) -> Iterable[MetadataWorkUnit]: + dataset_urn = self.gen_dataset_urn(project_id, dataset_name, table.name) + + status = Status(removed=False) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=status + ).as_workunit() + + datahub_dataset_name = BigqueryTableIdentifier( + project_id, dataset_name, table.name + ) + + yield self.gen_schema_metadata( + dataset_urn, table, columns, datahub_dataset_name + ) + + dataset_properties = DatasetProperties( + name=datahub_dataset_name.get_table_display_name(), + description=( + unquote_and_decode_unicode_escape_seq(table.comment) + if table.comment + else "" + ), + qualifiedName=str(datahub_dataset_name), + created=( + TimeStamp(time=int(table.created.timestamp() * 1000)) + if table.created is not None + else None + ), + lastModified=( + TimeStamp(time=int(table.last_altered.timestamp() * 1000)) + if table.last_altered is not None + else None + ), + externalUrl=( + BQ_EXTERNAL_TABLE_URL_TEMPLATE.format( + project=project_id, dataset=dataset_name, table=table.name + ) + if self.config.include_external_url + else None + ), + ) + if custom_properties: + dataset_properties.customProperties.update(custom_properties) + + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=dataset_properties + ).as_workunit() + + if tags_to_add: + yield self.gen_tags_aspect_workunit(dataset_urn, tags_to_add) + + yield from add_table_to_schema_container( + dataset_urn=dataset_urn, + 
parent_container_key=self.gen_dataset_key(project_id, dataset_name), + ) + yield self.get_dataplatform_instance_aspect( + dataset_urn=dataset_urn, project_id=project_id + ) + + subTypes = SubTypes(typeNames=sub_types) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=subTypes + ).as_workunit() + + if self.domain_registry: + yield from get_domain_wu( + dataset_name=str(datahub_dataset_name), + entity_urn=dataset_urn, + domain_registry=self.domain_registry, + domain_config=self.config.domain, + ) + + def gen_tags_aspect_workunit( + self, dataset_urn: str, tags_to_add: List[str] + ) -> MetadataWorkUnit: + tags = GlobalTagsClass( + tags=[TagAssociationClass(tag_to_add) for tag_to_add in tags_to_add] + ) + return MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=tags + ).as_workunit() + + def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: + schema_fields: List[SchemaField] = [] + + # Below line affects HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR in global scope + # TODO: Refractor this such that + # converter = HiveColumnToAvroConverter(struct_type_separator=" "); + # converter.get_schema_fields_for_hive_column(...) 
+ original_struct_type_separator = ( + HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR + ) + HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR = " " + _COMPLEX_TYPE = re.compile("^(struct|array)") + last_id = -1 + for col in columns: + # if col.data_type is empty that means this column is part of a complex type + if col.data_type is None or _COMPLEX_TYPE.match(col.data_type.lower()): + # If the we have seen the ordinal position that most probably means we already processed this complex type + if last_id != col.ordinal_position: + schema_fields.extend( + get_schema_fields_for_hive_column( + col.name, col.data_type.lower(), description=col.comment + ) + ) + + # We have to add complex type comments to the correct level + if col.comment: + for idx, field in enumerate(schema_fields): + # Remove all the [version=2.0].[type=struct]. tags to get the field path + if ( + re.sub( + r"\[.*?\]\.", + repl="", + string=field.fieldPath.lower(), + count=0, + flags=re.MULTILINE, + ) + == col.field_path.lower() + ): + field.description = col.comment + schema_fields[idx] = field + break + else: + tags = [] + if col.is_partition_column: + tags.append( + TagAssociationClass(make_tag_urn(Constants.TAG_PARTITION_KEY)) + ) + + if col.cluster_column_position is not None: + tags.append( + TagAssociationClass( + make_tag_urn( + f"{CLUSTERING_COLUMN_TAG}_{col.cluster_column_position}" + ) + ) + ) + + if col.policy_tags: + for policy_tag in col.policy_tags: + tags.append(TagAssociationClass(make_tag_urn(policy_tag))) + field = SchemaField( + fieldPath=col.name, + type=SchemaFieldDataType( + self.BIGQUERY_FIELD_TYPE_MAPPINGS.get(col.data_type, NullType)() + ), + nativeDataType=col.data_type, + description=col.comment, + nullable=col.is_nullable, + globalTags=GlobalTagsClass(tags=tags), + ) + schema_fields.append(field) + last_id = col.ordinal_position + HiveColumnToAvroConverter._STRUCT_TYPE_SEPARATOR = ( + original_struct_type_separator + ) + return schema_fields + + def gen_schema_metadata( + 
self, + dataset_urn: str, + table: Union[BigqueryTable, BigqueryView, BigqueryTableSnapshot], + columns: List[BigqueryColumn], + dataset_name: BigqueryTableIdentifier, + ) -> MetadataWorkUnit: + schema_metadata = SchemaMetadata( + schemaName=str(dataset_name), + platform=make_data_platform_urn(self.platform), + version=0, + hash="", + platformSchema=MySqlDDL(tableSchema=""), + # fields=[], + fields=self.gen_schema_fields(columns), + ) + + if self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser: + self.sql_parser_schema_resolver.add_schema_metadata( + dataset_urn, schema_metadata + ) + + return MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=schema_metadata + ).as_workunit() + + def get_tables_for_dataset( + self, + project_id: str, + dataset_name: str, + ) -> Iterable[BigqueryTable]: + # In bigquery there is no way to query all tables in a Project id + with PerfTimer() as timer: + # Partitions view throw exception if we try to query partition info for too many tables + # so we have to limit the number of tables we query partition info. + # The conn.list_tables returns table infos that information_schema doesn't contain and this + # way we can merge that info with the queried one. 
+ # https://cloud.google.com/bigquery/docs/information-schema-partitions + max_batch_size: int = ( + self.config.number_of_datasets_process_in_batch + if not self.config.is_profiling_enabled() + else self.config.number_of_datasets_process_in_batch_if_profiling_enabled + ) + + # We get the list of tables in the dataset to get core table properties and to be able to process the tables in batches + # We collect only the latest shards from sharded tables (tables with _YYYYMMDD suffix) and ignore temporary tables + table_items = self.get_core_table_details( + dataset_name, project_id, self.config.temp_table_dataset_prefix + ) + + items_to_get: Dict[str, TableListItem] = {} + for table_item in table_items: + items_to_get[table_item] = table_items[table_item] + if len(items_to_get) % max_batch_size == 0: + yield from self.bigquery_data_dictionary.get_tables_for_dataset( + project_id, + dataset_name, + items_to_get, + with_data_read_permission=self.config.have_table_data_read_permission, + report=self.report, + ) + items_to_get.clear() + + if items_to_get: + yield from self.bigquery_data_dictionary.get_tables_for_dataset( + project_id, + dataset_name, + items_to_get, + with_data_read_permission=self.config.have_table_data_read_permission, + report=self.report, + ) + + self.report.metadata_extraction_sec[f"{project_id}.{dataset_name}"] = round( + timer.elapsed_seconds(), 2 + ) + + def get_core_table_details( + self, dataset_name: str, project_id: str, temp_table_dataset_prefix: str + ) -> Dict[str, TableListItem]: + table_items: Dict[str, TableListItem] = {} + # Dict to store sharded table and the last seen max shard id + sharded_tables: Dict[str, TableListItem] = {} + + for table in self.bigquery_data_dictionary.list_tables( + dataset_name, project_id + ): + table_identifier = BigqueryTableIdentifier( + project_id=project_id, + dataset=dataset_name, + table=table.table_id, + ) + + if table.table_type == "VIEW": + if ( + not self.config.include_views + or not 
self.config.view_pattern.allowed( + table_identifier.raw_table_name() + ) + ): + self.report.report_dropped(table_identifier.raw_table_name()) + continue + else: + if not self.config.table_pattern.allowed( + table_identifier.raw_table_name() + ): + self.report.report_dropped(table_identifier.raw_table_name()) + continue + + _, shard = BigqueryTableIdentifier.get_table_and_shard( + table_identifier.table + ) + table_name = table_identifier.get_table_name().split(".")[-1] + + # Sharded tables look like: table_20220120 + # For sharded tables we only process the latest shard and ignore the rest + # to find the latest shard we iterate over the list of tables and store the maximum shard id + # We only have one special case where the table name is a date `20220110` + # in this case we merge all these tables under dataset name as table name. + # For example some_dataset.20220110 will be turned to some_dataset.some_dataset + # It seems like there are some bigquery user who uses this non-standard way of sharding the tables. 
+ if shard: + if table_name not in sharded_tables: + sharded_tables[table_name] = table + continue + + stored_table_identifier = BigqueryTableIdentifier( + project_id=project_id, + dataset=dataset_name, + table=sharded_tables[table_name].table_id, + ) + _, stored_shard = BigqueryTableIdentifier.get_table_and_shard( + stored_table_identifier.table + ) + # When table is none, we use dataset_name as table_name + assert stored_shard + if stored_shard < shard: + sharded_tables[table_name] = table + continue + elif str(table_identifier).startswith(temp_table_dataset_prefix): + logger.debug(f"Dropping temporary table {table_identifier.table}") + self.report.report_dropped(table_identifier.raw_table_name()) + continue + + table_items[table.table_id] = table + + # Adding maximum shards to the list of tables + table_items.update({value.table_id: value for value in sharded_tables.values()}) + + return table_items diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py new file mode 100644 index 00000000000000..3aac78c154b2ee --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_test_connection.py @@ -0,0 +1,178 @@ +import logging +from datetime import datetime, timedelta +from typing import Dict, List, Union + +from google.cloud import bigquery + +from datahub.ingestion.api.source import ( + CapabilityReport, + SourceCapability, + TestConnectionReport, +) +from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report +from datahub.ingestion.source.bigquery_v2.bigquery_schema import BigQuerySchemaApi +from datahub.ingestion.source.bigquery_v2.lineage import BigqueryLineageExtractor +from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor +from datahub.sql_parsing.schema_resolver import 
SchemaResolver + +logger: logging.Logger = logging.getLogger(__name__) + + +class BigQueryTestConnection: + @staticmethod + def test_connection(config_dict: dict) -> TestConnectionReport: + test_report = TestConnectionReport() + _report: Dict[Union[SourceCapability, str], CapabilityReport] = dict() + + try: + connection_conf = BigQueryV2Config.parse_obj_allow_extras(config_dict) + client: bigquery.Client = connection_conf.get_bigquery_client() + assert client + + test_report.basic_connectivity = BigQueryTestConnection.connectivity_test( + client + ) + + connection_conf.start_time = datetime.now() + connection_conf.end_time = datetime.now() + timedelta(minutes=1) + + report: BigQueryV2Report = BigQueryV2Report() + project_ids: List[str] = [] + projects = client.list_projects() + + for project in projects: + if connection_conf.project_id_pattern.allowed(project.project_id): + project_ids.append(project.project_id) + + metadata_read_capability = ( + BigQueryTestConnection.metadata_read_capability_test( + project_ids, connection_conf + ) + ) + if SourceCapability.SCHEMA_METADATA not in _report: + _report[SourceCapability.SCHEMA_METADATA] = metadata_read_capability + + if connection_conf.include_table_lineage: + lineage_capability = BigQueryTestConnection.lineage_capability_test( + connection_conf, project_ids, report + ) + if SourceCapability.LINEAGE_COARSE not in _report: + _report[SourceCapability.LINEAGE_COARSE] = lineage_capability + + if connection_conf.include_usage_statistics: + usage_capability = BigQueryTestConnection.usage_capability_test( + connection_conf, project_ids, report + ) + if SourceCapability.USAGE_STATS not in _report: + _report[SourceCapability.USAGE_STATS] = usage_capability + + test_report.capability_report = _report + return test_report + + except Exception as e: + test_report.basic_connectivity = CapabilityReport( + capable=False, failure_reason=f"{e}" + ) + return test_report + + @staticmethod + def connectivity_test(client: bigquery.Client) 
-> CapabilityReport: + ret = client.query("select 1") + if ret.error_result: + return CapabilityReport( + capable=False, failure_reason=f"{ret.error_result['message']}" + ) + else: + return CapabilityReport(capable=True) + + @staticmethod + def metadata_read_capability_test( + project_ids: List[str], config: BigQueryV2Config + ) -> CapabilityReport: + for project_id in project_ids: + try: + logger.info(f"Metadata read capability test for project {project_id}") + client: bigquery.Client = config.get_bigquery_client() + assert client + bigquery_data_dictionary = BigQuerySchemaApi( + BigQueryV2Report().schema_api_perf, client + ) + result = bigquery_data_dictionary.get_datasets_for_project_id( + project_id, 10 + ) + if len(result) == 0: + return CapabilityReport( + capable=False, + failure_reason=f"Dataset query returned empty dataset. It is either empty or no dataset in project {project_id}", + ) + tables = bigquery_data_dictionary.get_tables_for_dataset( + project_id=project_id, + dataset_name=result[0].name, + tables={}, + with_data_read_permission=config.have_table_data_read_permission, + report=BigQueryV2Report(), + ) + if len(list(tables)) == 0: + return CapabilityReport( + capable=False, + failure_reason=f"Tables query did not return any table. 
It is either empty or no tables in project {project_id}.{result[0].name}", + ) + + except Exception as e: + return CapabilityReport( + capable=False, + failure_reason=f"Dataset query failed with error: {e}", + ) + + return CapabilityReport(capable=True) + + @staticmethod + def lineage_capability_test( + connection_conf: BigQueryV2Config, + project_ids: List[str], + report: BigQueryV2Report, + ) -> CapabilityReport: + lineage_extractor = BigqueryLineageExtractor( + connection_conf, report, lambda ref: "" + ) + for project_id in project_ids: + try: + logger.info(f"Lineage capability test for project {project_id}") + lineage_extractor.test_capability(project_id) + except Exception as e: + return CapabilityReport( + capable=False, + failure_reason=f"Lineage capability test failed with: {e}", + ) + + return CapabilityReport(capable=True) + + @staticmethod + def usage_capability_test( + connection_conf: BigQueryV2Config, + project_ids: List[str], + report: BigQueryV2Report, + ) -> CapabilityReport: + usage_extractor = BigQueryUsageExtractor( + connection_conf, + report, + schema_resolver=SchemaResolver(platform="bigquery"), + dataset_urn_builder=lambda ref: "", + ) + for project_id in project_ids: + try: + logger.info(f"Usage capability test for project {project_id}") + failures_before_test = len(report.failures) + usage_extractor.test_capability(project_id) + if failures_before_test != len(report.failures): + return CapabilityReport( + capable=False, + failure_reason="Usage capability test failed. 
Check the logs for further info", + ) + except Exception as e: + return CapabilityReport( + capable=False, + failure_reason=f"Usage capability test failed with: {e} for project {project_id}", + ) + return CapabilityReport(capable=True) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index c41207ec67f620..496bd64d3b4fe2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -251,11 +251,6 @@ def get_time_window(self) -> Tuple[datetime, datetime]: else: return self.config.start_time, self.config.end_time - def error(self, log: logging.Logger, key: str, reason: str) -> None: - # TODO: Remove this method. - # Note that this downgrades the error to a warning. - self.report.warning(key, reason) - def _should_ingest_lineage(self) -> bool: if ( self.redundant_run_skip_handler @@ -265,9 +260,9 @@ def _should_ingest_lineage(self) -> bool: ) ): # Skip this run - self.report.report_warning( - "lineage-extraction", - "Skip this run as there was already a run for current ingestion window.", + self.report.warning( + title="Skipped redundant lineage extraction", + message="Skip this run as there was already a run for current ingestion window.", ) return False @@ -345,12 +340,12 @@ def generate_lineage( events, sql_parser_schema_resolver ) except Exception as e: - if project_id: - self.report.lineage_failed_extraction.append(project_id) - self.error( - logger, - "lineage", - f"{project_id}: {e}", + self.report.lineage_failed_extraction.append(project_id) + self.report.warning( + title="Failed to extract lineage", + message="Unexpected error encountered", + context=project_id, + exc=e, ) lineage = {} @@ -481,98 +476,88 @@ def lineage_via_catalog_lineage_api( # Regions to search for BigQuery tables: projects/{project_id}/locations/{region} enabled_regions: List[str] = 
["US", "EU"] - try: - lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() + lineage_client: lineage_v1.LineageClient = lineage_v1.LineageClient() + + data_dictionary = BigQuerySchemaApi( + self.report.schema_api_perf, self.config.get_bigquery_client() + ) - data_dictionary = BigQuerySchemaApi( - self.report.schema_api_perf, self.config.get_bigquery_client() + # Filtering datasets + datasets = list(data_dictionary.get_datasets_for_project_id(project_id)) + project_tables = [] + for dataset in datasets: + # Enables only tables where type is TABLE, VIEW or MATERIALIZED_VIEW (not EXTERNAL) + project_tables.extend( + [ + table + for table in data_dictionary.list_tables(dataset.name, project_id) + if table.table_type in ["TABLE", "VIEW", "MATERIALIZED_VIEW"] + ] ) - # Filtering datasets - datasets = list(data_dictionary.get_datasets_for_project_id(project_id)) - project_tables = [] - for dataset in datasets: - # Enables only tables where type is TABLE, VIEW or MATERIALIZED_VIEW (not EXTERNAL) - project_tables.extend( + lineage_map: Dict[str, Set[LineageEdge]] = {} + curr_date = datetime.now() + for project_table in project_tables: + # Convert project table to .. format + table = f"{project_table.project}.{project_table.dataset_id}.{project_table.table_id}" + + if not is_schema_allowed( + self.config.dataset_pattern, + schema_name=project_table.dataset_id, + db_name=project_table.project, + match_fully_qualified_schema_name=self.config.match_fully_qualified_names, + ) or not self.config.table_pattern.allowed(table): + self.report.num_skipped_lineage_entries_not_allowed[ + project_table.project + ] += 1 + continue + + logger.info("Creating lineage map for table %s", table) + upstreams = set() + downstream_table = lineage_v1.EntityReference() + # fully_qualified_name in format: "bigquery:.." 
+ downstream_table.fully_qualified_name = f"bigquery:{table}" + # Searches in different regions + for region in enabled_regions: + location_request = lineage_v1.SearchLinksRequest( + target=downstream_table, + parent=f"projects/{project_id}/locations/{region.lower()}", + ) + response = lineage_client.search_links(request=location_request) + upstreams.update( [ - table - for table in data_dictionary.list_tables( - dataset.name, project_id + str(lineage.source.fully_qualified_name).replace( + "bigquery:", "" ) - if table.table_type in ["TABLE", "VIEW", "MATERIALIZED_VIEW"] + for lineage in response ] ) - lineage_map: Dict[str, Set[LineageEdge]] = {} - curr_date = datetime.now() - for project_table in project_tables: - # Convert project table to .. format - table = f"{project_table.project}.{project_table.dataset_id}.{project_table.table_id}" - - if not is_schema_allowed( - self.config.dataset_pattern, - schema_name=project_table.dataset_id, - db_name=project_table.project, - match_fully_qualified_schema_name=self.config.match_fully_qualified_names, - ) or not self.config.table_pattern.allowed(table): - self.report.num_skipped_lineage_entries_not_allowed[ - project_table.project - ] += 1 - continue - - logger.info("Creating lineage map for table %s", table) - upstreams = set() - downstream_table = lineage_v1.EntityReference() - # fully_qualified_name in format: "bigquery:.." 
- downstream_table.fully_qualified_name = f"bigquery:{table}" - # Searches in different regions - for region in enabled_regions: - location_request = lineage_v1.SearchLinksRequest( - target=downstream_table, - parent=f"projects/{project_id}/locations/{region.lower()}", - ) - response = lineage_client.search_links(request=location_request) - upstreams.update( - [ - str(lineage.source.fully_qualified_name).replace( - "bigquery:", "" - ) - for lineage in response - ] - ) - - # Downstream table identifier - destination_table_str = str( - BigQueryTableRef( - table_identifier=BigqueryTableIdentifier(*table.split(".")) - ) + # Downstream table identifier + destination_table_str = str( + BigQueryTableRef( + table_identifier=BigqueryTableIdentifier(*table.split(".")) ) + ) - # Only builds lineage map when the table has upstreams - logger.debug("Found %d upstreams for table %s", len(upstreams), table) - if upstreams: - lineage_map[destination_table_str] = { - LineageEdge( - table=str( - BigQueryTableRef( - table_identifier=BigqueryTableIdentifier.from_string_name( - source_table - ) + # Only builds lineage map when the table has upstreams + logger.debug("Found %d upstreams for table %s", len(upstreams), table) + if upstreams: + lineage_map[destination_table_str] = { + LineageEdge( + table=str( + BigQueryTableRef( + table_identifier=BigqueryTableIdentifier.from_string_name( + source_table ) - ), - column_mapping=frozenset(), - auditStamp=curr_date, - ) - for source_table in upstreams - } - return lineage_map - except Exception as e: - self.error( - logger, - "lineage-exported-catalog-lineage-api", - f"Error: {e}", - ) - raise e + ) + ), + column_mapping=frozenset(), + auditStamp=curr_date, + ) + for source_table in upstreams + } + return lineage_map def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: # We adjust the filter values a bit, since we need to make sure that the join diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index 8c393d1e8a4369..582c312f99098b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -227,8 +227,9 @@ def get_profile_request( if partition is None and bq_table.partition_info: self.report.report_warning( - "profile skipped as partitioned table is empty or partition id or type was invalid", - profile_request.pretty_name, + title="Profile skipped for partitioned table", + message="profile skipped as partitioned table is empty or partition id or type was invalid", + context=profile_request.pretty_name, ) return None if ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 1b95cbf5050161..6824d630a2277a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -358,9 +358,9 @@ def _should_ingest_usage(self) -> bool: ) ): # Skip this run - self.report.report_warning( - "usage-extraction", - "Skip this run as there was already a run for current ingestion window.", + self.report.warning( + title="Skipped redundant usage extraction", + message="Skip this run as there was already a run for current ingestion window.", ) return False @@ -410,8 +410,7 @@ def _get_workunits_internal( ) usage_state.report_disk_usage(self.report) except Exception as e: - logger.error("Error processing usage", exc_info=True) - self.report.report_warning("usage-ingestion", str(e)) + self.report.warning(message="Error processing usage", exc=e) self.report_status("usage-ingestion", False) def generate_read_events_from_query( @@ -477,10 +476,12 @@ def _ingest_events( ) except Exception as e: - logger.warning( - f"Unable to store usage 
event {audit_event}", exc_info=True + self.report.warning( + message="Unable to store usage event", + context=f"{audit_event}", + exc=e, ) - self._report_error("store-event", e) + logger.info(f"Total number of events aggregated = {num_aggregated}.") if self.report.num_view_query_events > 0: @@ -500,11 +501,11 @@ def _generate_operational_workunits( yield operational_wu self.report.num_operational_stats_workunits_emitted += 1 except Exception as e: - logger.warning( - f"Unable to generate operation workunit for event {audit_event}", - exc_info=True, + self.report.warning( + message="Unable to generate operation workunit", + context=f"{audit_event}", + exc=e, ) - self._report_error("operation-workunit", e) def _generate_usage_workunits( self, usage_state: BigQueryUsageState @@ -541,11 +542,11 @@ def _generate_usage_workunits( ) self.report.num_usage_workunits_emitted += 1 except Exception as e: - logger.warning( - f"Unable to generate usage workunit for bucket {entry.timestamp}, {entry.resource}", - exc_info=True, + self.report.warning( + message="Unable to generate usage statistics workunit", + context=f"{entry.timestamp}, {entry.resource}", + exc=e, ) - self._report_error("statistics-workunit", e) def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: if self.config.use_exported_bigquery_audit_metadata: @@ -559,12 +560,12 @@ def _get_usage_events(self, projects: Iterable[str]) -> Iterable[AuditEvent]: ) yield from self._get_parsed_bigquery_log_events(project_id) except Exception as e: - logger.error( - f"Error getting usage events for project {project_id}", - exc_info=True, - ) self.report.usage_failed_extraction.append(project_id) - self.report.report_warning(f"usage-extraction-{project_id}", str(e)) + self.report.warning( + message="Failed to get some or all usage events for project", + context=project_id, + exc=e, + ) self.report_status(f"usage-extraction-{project_id}", False) self.report.usage_extraction_sec[project_id] = round( @@ 
-898,12 +899,10 @@ def _get_parsed_bigquery_log_events( self.report.num_usage_parsed_log_entries[project_id] += 1 yield event except Exception as e: - logger.warning( - f"Unable to parse log entry `{entry}` for project {project_id}", - exc_info=True, - ) - self._report_error( - f"log-parse-{project_id}", e, group="usage-log-parse" + self.report.warning( + message="Unable to parse usage log entry", + context=f"`{entry}` for project {project_id}", + exc=e, ) def _generate_filter(self, corrected_start_time, corrected_end_time): @@ -946,13 +945,6 @@ def get_tables_from_query( return parsed_table_refs - def _report_error( - self, label: str, e: Exception, group: Optional[str] = None - ) -> None: - """Report an error that does not constitute a major failure.""" - self.report.usage_error_count[label] += 1 - self.report.report_warning(group or f"usage-{label}", str(e)) - def test_capability(self, project_id: str) -> None: for entry in self._get_parsed_bigquery_log_events(project_id, limit=1): logger.debug(f"Connection test got one {entry}") diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index dcc18635de32c3..72f8f8ad793fdd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -1,7 +1,5 @@ -import concurrent.futures import itertools import logging -import queue from typing import Callable, Dict, Iterable, List, Optional, Union from datahub.configuration.pattern_utils import is_schema_allowed @@ -101,6 +99,7 @@ from datahub.metadata.com.linkedin.pegasus2avro.tag import TagProperties from datahub.sql_parsing.sql_parsing_aggregator import SqlParsingAggregator from datahub.utilities.registries.domain_registry import DomainRegistry +from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor logger 
= logging.getLogger(__name__) @@ -318,41 +317,22 @@ def _process_db_schemas( snowflake_db: SnowflakeDatabase, db_tables: Dict[str, List[SnowflakeTable]], ) -> Iterable[MetadataWorkUnit]: - q: "queue.Queue[MetadataWorkUnit]" = queue.Queue(maxsize=100) - - def _process_schema_worker(snowflake_schema: SnowflakeSchema) -> None: + def _process_schema_worker( + snowflake_schema: SnowflakeSchema, + ) -> Iterable[MetadataWorkUnit]: for wu in self._process_schema( snowflake_schema, snowflake_db.name, db_tables ): - q.put(wu) - - with concurrent.futures.ThreadPoolExecutor( - max_workers=SCHEMA_PARALLELISM - ) as executor: - futures = [] - for snowflake_schema in snowflake_db.schemas: - f = executor.submit(_process_schema_worker, snowflake_schema) - futures.append(f) - - # Read from the queue and yield the work units until all futures are done. - while True: - if not q.empty(): - while not q.empty(): - yield q.get_nowait() - else: - try: - yield q.get(timeout=0.2) - except queue.Empty: - pass - - # Filter out the done futures. - futures = [f for f in futures if not f.done()] - if not futures: - break - - # Yield the remaining work units. This theoretically should not happen, but adding it just in case. 
- while not q.empty(): - yield q.get_nowait() + yield wu + + for wu in ThreadedIteratorExecutor.process( + worker_func=_process_schema_worker, + args_list=[ + (snowflake_schema,) for snowflake_schema in snowflake_db.schemas + ], + max_workers=SCHEMA_PARALLELISM, + ): + yield wu def fetch_schemas_for_database( self, snowflake_db: SnowflakeDatabase, db_name: str diff --git a/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py b/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py new file mode 100644 index 00000000000000..216fa155035d3e --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/threaded_iterator_executor.py @@ -0,0 +1,52 @@ +import concurrent.futures +import contextlib +import queue +from typing import Any, Callable, Generator, Iterable, Tuple, TypeVar + +T = TypeVar("T") + + +class ThreadedIteratorExecutor: + """ + Executes worker functions of type `Callable[..., Iterable[T]]` in parallel threads, + yielding items of type `T` as they become available. + """ + + @classmethod + def process( + cls, + worker_func: Callable[..., Iterable[T]], + args_list: Iterable[Tuple[Any, ...]], + max_workers: int, + ) -> Generator[T, None, None]: + + out_q: queue.Queue[T] = queue.Queue() + + def _worker_wrapper( + worker_func: Callable[..., Iterable[T]], *args: Any + ) -> None: + for item in worker_func(*args): + out_q.put(item) + + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = [] + for args in args_list: + future = executor.submit(_worker_wrapper, worker_func, *args) + futures.append(future) + # Read from the queue and yield the work units until all futures are done. + while True: + if not out_q.empty(): + while not out_q.empty(): + yield out_q.get_nowait() + else: + with contextlib.suppress(queue.Empty): + yield out_q.get(timeout=0.2) + + # Filter out the done futures. + futures = [f for f in futures if not f.done()] + if not futures: + break + + # Yield the remaining work units. 
This theoretically should not happen, but adding it just in case. + while not out_q.empty(): + yield out_q.get_nowait() diff --git a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py index a24b6174eb9250..762c73d2a55c60 100644 --- a/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py +++ b/metadata-ingestion/tests/integration/bigquery_v2/test_bigquery.py @@ -11,7 +11,6 @@ DynamicTypedClassifierConfig, ) from datahub.ingestion.glossary.datahub_classifier import DataHubClassifierConfig -from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source from datahub.ingestion.source.bigquery_v2.bigquery_data_reader import BigQueryDataReader from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( BigqueryColumn, @@ -19,6 +18,9 @@ BigQuerySchemaApi, BigqueryTable, ) +from datahub.ingestion.source.bigquery_v2.bigquery_schema_gen import ( + BigQuerySchemaGenerator, +) from tests.test_helpers import mce_helpers from tests.test_helpers.state_helpers import run_and_get_pipeline @@ -39,7 +41,7 @@ def random_email(): @freeze_time(FROZEN_TIME) @patch.object(BigQuerySchemaApi, "get_tables_for_dataset") -@patch.object(BigqueryV2Source, "get_core_table_details") +@patch.object(BigQuerySchemaGenerator, "get_core_table_details") @patch.object(BigQuerySchemaApi, "get_datasets_for_project_id") @patch.object(BigQuerySchemaApi, "get_columns_for_dataset") @patch.object(BigQueryDataReader, "get_sample_data_for_table") diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index b58f35c0deef51..ea32db0ef27574 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -32,6 +32,9 @@ BigqueryTableSnapshot, BigqueryView, ) +from datahub.ingestion.source.bigquery_v2.bigquery_schema_gen import ( + BigQuerySchemaGenerator, +) from 
datahub.ingestion.source.bigquery_v2.lineage import ( LineageEdge, LineageEdgeColumnMapping, @@ -231,8 +234,9 @@ def test_get_dataplatform_instance_aspect_returns_project_id(get_bq_client_mock) config = BigQueryV2Config.parse_obj({"include_data_platform_instance": True}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + schema_gen = source.bq_schema_extractor - data_platform_instance = source.get_dataplatform_instance_aspect( + data_platform_instance = schema_gen.get_dataplatform_instance_aspect( "urn:li:test", project_id ) metadata = data_platform_instance.metadata @@ -246,8 +250,9 @@ def test_get_dataplatform_instance_aspect_returns_project_id(get_bq_client_mock) def test_get_dataplatform_instance_default_no_instance(get_bq_client_mock): config = BigQueryV2Config.parse_obj({}) source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + schema_gen = source.bq_schema_extractor - data_platform_instance = source.get_dataplatform_instance_aspect( + data_platform_instance = schema_gen.get_dataplatform_instance_aspect( "urn:li:test", "project_id" ) metadata = data_platform_instance.metadata @@ -395,8 +400,9 @@ def test_gen_table_dataset_workunits(get_bq_client_mock, bigquery_table): source: BigqueryV2Source = BigqueryV2Source( config=config, ctx=PipelineContext(run_id="test") ) + schema_gen = source.bq_schema_extractor - gen = source.gen_table_dataset_workunits( + gen = schema_gen.gen_table_dataset_workunits( bigquery_table, [], project_id, dataset_name ) mcp = cast(MetadataChangeProposalClass, next(iter(gen)).metadata) @@ -710,9 +716,10 @@ def test_table_processing_logic(get_bq_client_mock, data_dictionary_mock): data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + schema_gen = source.bq_schema_extractor _ = list( - source.get_tables_for_dataset( + schema_gen.get_tables_for_dataset( project_id="test-project", 
dataset_name="test-dataset" ) ) @@ -784,9 +791,10 @@ def test_table_processing_logic_date_named_tables( data_dictionary_mock.get_tables_for_dataset.return_value = None source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test")) + schema_gen = source.bq_schema_extractor _ = list( - source.get_tables_for_dataset( + schema_gen.get_tables_for_dataset( project_id="test-project", dataset_name="test-dataset" ) ) @@ -882,7 +890,9 @@ def test_get_views_for_dataset( assert list(views) == [bigquery_view_1, bigquery_view_2] -@patch.object(BigqueryV2Source, "gen_dataset_workunits", lambda *args, **kwargs: []) +@patch.object( + BigQuerySchemaGenerator, "gen_dataset_workunits", lambda *args, **kwargs: [] +) @patch.object(BigQueryV2Config, "get_bigquery_client") def test_gen_view_dataset_workunits( get_bq_client_mock, bigquery_view_1, bigquery_view_2 @@ -897,8 +907,9 @@ def test_gen_view_dataset_workunits( source: BigqueryV2Source = BigqueryV2Source( config=config, ctx=PipelineContext(run_id="test") ) + schema_gen = source.bq_schema_extractor - gen = source.gen_view_dataset_workunits( + gen = schema_gen.gen_view_dataset_workunits( bigquery_view_1, [], project_id, dataset_name ) mcp = cast(MetadataChangeProposalClass, next(iter(gen)).metadata) @@ -908,7 +919,7 @@ def test_gen_view_dataset_workunits( viewLogic=bigquery_view_1.view_definition, ) - gen = source.gen_view_dataset_workunits( + gen = schema_gen.gen_view_dataset_workunits( bigquery_view_2, [], project_id, dataset_name ) mcp = cast(MetadataChangeProposalClass, next(iter(gen)).metadata) @@ -990,8 +1001,9 @@ def test_gen_snapshot_dataset_workunits(get_bq_client_mock, bigquery_snapshot): source: BigqueryV2Source = BigqueryV2Source( config=config, ctx=PipelineContext(run_id="test") ) + schema_gen = source.bq_schema_extractor - gen = source.gen_snapshot_dataset_workunits( + gen = schema_gen.gen_snapshot_dataset_workunits( bigquery_snapshot, [], project_id, dataset_name ) mcp = cast(MetadataChangeProposalWrapper, 
list(gen)[2].metadata) diff --git a/metadata-ingestion/tests/unit/utilities/test_threaded_iterator_executor.py b/metadata-ingestion/tests/unit/utilities/test_threaded_iterator_executor.py new file mode 100644 index 00000000000000..35c44c7b4a8479 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_threaded_iterator_executor.py @@ -0,0 +1,14 @@ +from datahub.utilities.threaded_iterator_executor import ThreadedIteratorExecutor + + +def test_threaded_iterator_executor(): + def table_of(i): + for j in range(1, 11): + yield f"{i}x{j}={i*j}" + + assert { + res + for res in ThreadedIteratorExecutor.process( + table_of, [(i,) for i in range(1, 30)], max_workers=2 + ) + } == {x for i in range(1, 30) for x in table_of(i)} From bb24651264e3076115b1223637e9284f575d1d70 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 16 Jul 2024 12:27:37 -0700 Subject: [PATCH 14/18] fix(airflow): add error handling around render_template() (#10907) --- .../src/datahub_airflow_plugin/datahub_listener.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py index 6ef4f831522cb9..c87f7f8fb1a8ee 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py @@ -362,8 +362,13 @@ def on_task_instance_running( # Render templates in a copy of the task instance. # This is necessary to get the correct operator args in the extractors. - task_instance = copy.deepcopy(task_instance) - task_instance.render_templates() + try: + task_instance = copy.deepcopy(task_instance) + task_instance.render_templates() + except Exception as e: + logger.info( + f"Error rendering templates in DataHub listener. 
Jinja-templated variables will not be extracted correctly: {e}" + ) # The type ignore is to placate mypy on Airflow 2.1.x. dagrun: "DagRun" = task_instance.dag_run # type: ignore[attr-defined] From a8b07c5fe6dc55eebf44e63b35cd957709c56a26 Mon Sep 17 00:00:00 2001 From: Nadav Gross <33874964+nadavgross@users.noreply.github.com> Date: Tue, 16 Jul 2024 22:28:14 +0300 Subject: [PATCH 15/18] feat(ingestion/sqlglot): add optional `default_dialect` parameter to sqlglot lineage (#10830) --- .../src/datahub/ingestion/graph/client.py | 2 ++ .../src/datahub/sql_parsing/sqlglot_lineage.py | 16 +++++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 7ba412b3e772c0..1d6097da231f8f 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -1241,6 +1241,7 @@ def parse_sql_lineage( env: str = DEFAULT_ENV, default_db: Optional[str] = None, default_schema: Optional[str] = None, + default_dialect: Optional[str] = None, ) -> "SqlParsingResult": from datahub.sql_parsing.sqlglot_lineage import sqlglot_lineage @@ -1254,6 +1255,7 @@ def parse_sql_lineage( schema_resolver=schema_resolver, default_db=default_db, default_schema=default_schema, + default_dialect=default_dialect, ) def create_tag(self, tag_name: str) -> str: diff --git a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py index 9c2a588a577cc7..976ff8bcc9b3ff 100644 --- a/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/sql_parsing/sqlglot_lineage.py @@ -843,8 +843,14 @@ def _sqlglot_lineage_inner( schema_resolver: SchemaResolverInterface, default_db: Optional[str] = None, default_schema: Optional[str] = None, + default_dialect: Optional[str] = None, ) -> SqlParsingResult: - dialect = 
get_dialect(schema_resolver.platform) + + if not default_dialect: + dialect = get_dialect(schema_resolver.platform) + else: + dialect = get_dialect(default_dialect) + if is_dialect_instance(dialect, "snowflake"): # in snowflake, table identifiers must be uppercased to match sqlglot's behavior. if default_db: @@ -1003,6 +1009,7 @@ def sqlglot_lineage( schema_resolver: SchemaResolverInterface, default_db: Optional[str] = None, default_schema: Optional[str] = None, + default_dialect: Optional[str] = None, ) -> SqlParsingResult: """Parse a SQL statement and generate lineage information. @@ -1020,8 +1027,9 @@ def sqlglot_lineage( can be brittle with respect to missing schema information and complex SQL logic like UNNESTs. - The SQL dialect is inferred from the schema_resolver's platform. The - set of supported dialects is the same as sqlglot's. See their + The SQL dialect can be given as an argument called default_dialect or it can + be inferred from the schema_resolver's platform. + The set of supported dialects is the same as sqlglot's. See their `documentation `_ for the full list. @@ -1035,6 +1043,7 @@ def sqlglot_lineage( schema_resolver: The schema resolver to use for resolving table schemas. default_db: The default database to use for unqualified table names. default_schema: The default schema to use for unqualified table names. + default_dialect: A default dialect to override the dialect provided by 'schema_resolver'. Returns: A SqlParsingResult object containing the parsed lineage information. 
@@ -1059,6 +1068,7 @@ def sqlglot_lineage( schema_resolver=schema_resolver, default_db=default_db, default_schema=default_schema, + default_dialect=default_dialect, ) except Exception as e: return SqlParsingResult.make_from_error(e) From 1565fb01028efaaff5bfdef7e429f63eb0502b2d Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 16 Jul 2024 16:56:51 -0500 Subject: [PATCH 16/18] feat(mcp-mutator): new mcp mutator plugin (#10904) --- .../linkedin/metadata/aspect/ReadItem.java | 6 +- .../metadata/aspect/batch/AspectsBatch.java | 7 + .../metadata/aspect/plugins/PluginSpec.java | 20 +- .../aspect/plugins/hooks/MCLSideEffect.java | 2 +- .../aspect/plugins/hooks/MCPSideEffect.java | 4 +- .../aspect/plugins/hooks/MutationHook.java | 26 +- .../validation/AspectPayloadValidator.java | 4 +- metadata-io/build.gradle | 1 + metadata-io/metadata-io-api/build.gradle | 7 + .../entity/ebean/batch/AspectsBatchImpl.java | 26 +- .../entity/ebean/batch/ProposedItem.java | 80 +++++ .../ebean/batch/AspectsBatchImplTest.java | 320 ++++++++++++++++++ .../test/resources/AspectsBatchImplTest.yaml | 19 ++ .../aspect/hooks/IgnoreUnknownMutator.java | 80 +++++ .../hooks/IgnoreUnknownMutatorTest.java | 143 ++++++++ .../kafka/MaeConsumerApplication.java | 1 + .../MCLSpringCommonTestConfiguration.java | 3 + .../kafka/MceConsumerApplication.java | 3 +- .../src/main/resources/entity-registry.yml | 6 + .../metadata/context/RequestContext.java | 1 + .../src/main/resources/application.yaml | 2 + .../ConfigEntityRegistryFactory.java | 5 +- .../SpringStandardPluginConfiguration.java | 33 ++ .../metadata/aspect/SpringPluginFactory.java | 12 +- .../linkedin/gms/CommonApplicationConfig.java | 1 + 25 files changed, 786 insertions(+), 26 deletions(-) create mode 100644 metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java create mode 100644 
metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java create mode 100644 metadata-io/metadata-io-api/src/test/resources/AspectsBatchImplTest.yaml create mode 100644 metadata-io/src/main/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutator.java create mode 100644 metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java create mode 100644 metadata-service/factories/src/main/java/com/linkedin/gms/factory/plugins/SpringStandardPluginConfiguration.java diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/ReadItem.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/ReadItem.java index 342b5376d8a755..106596bf80ccf0 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/ReadItem.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/ReadItem.java @@ -5,6 +5,7 @@ import com.linkedin.data.template.RecordTemplate; import com.linkedin.metadata.models.AspectSpec; import com.linkedin.metadata.models.EntitySpec; +import com.linkedin.mxe.GenericAspect; import com.linkedin.mxe.SystemMetadata; import java.lang.reflect.InvocationTargetException; import javax.annotation.Nonnull; @@ -26,6 +27,9 @@ public interface ReadItem { */ @Nonnull default String getAspectName() { + if (getAspectSpec() == null) { + return GenericAspect.dataSchema().getName(); + } return getAspectSpec().getName(); } @@ -72,6 +76,6 @@ static T getAspect(Class clazz, @Nullable RecordTemplate recordTemplate) * * @return aspect's specification */ - @Nonnull + @Nullable AspectSpec getAspectSpec(); } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java index a302632e1936fd..77820948b00cbc 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java +++ 
b/entity-registry/src/main/java/com/linkedin/metadata/aspect/batch/AspectsBatch.java @@ -84,6 +84,13 @@ static void applyWriteMutationHooks( } } + default Stream applyProposalMutationHooks( + Collection proposedItems, @Nonnull RetrieverContext retrieverContext) { + return retrieverContext.getAspectRetriever().getEntityRegistry().getAllMutationHooks().stream() + .flatMap( + mutationHook -> mutationHook.applyProposalMutation(proposedItems, retrieverContext)); + } + default ValidationExceptionCollection validateProposed( Collection mcpItems) { return validateProposed(mcpItems, getRetrieverContext()); diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/PluginSpec.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/PluginSpec.java index 1adb1be81ecc1d..f99dd18d3c9c1f 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/PluginSpec.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/PluginSpec.java @@ -3,7 +3,6 @@ import com.linkedin.common.urn.Urn; import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; -import com.linkedin.metadata.models.AspectSpec; import com.linkedin.metadata.models.EntitySpec; import javax.annotation.Nonnull; import javax.annotation.Nullable; @@ -25,20 +24,13 @@ public boolean enabled() { } public boolean shouldApply( - @Nullable ChangeType changeType, @Nonnull Urn entityUrn, @Nonnull AspectSpec aspectSpec) { - return shouldApply(changeType, entityUrn.getEntityType(), aspectSpec); + @Nullable ChangeType changeType, @Nonnull Urn entityUrn, @Nonnull String aspectName) { + return shouldApply(changeType, entityUrn.getEntityType(), aspectName); } public boolean shouldApply( - @Nullable ChangeType changeType, - @Nonnull EntitySpec entitySpec, - @Nonnull AspectSpec aspectSpec) { - return shouldApply(changeType, entitySpec.getName(), aspectSpec.getName()); - } - - public boolean shouldApply( - 
@Nullable ChangeType changeType, @Nonnull String entityName, @Nonnull AspectSpec aspectSpec) { - return shouldApply(changeType, entityName, aspectSpec.getName()); + @Nullable ChangeType changeType, @Nonnull EntitySpec entitySpec, @Nonnull String aspectName) { + return shouldApply(changeType, entitySpec.getName(), aspectName); } public boolean shouldApply( @@ -49,8 +41,8 @@ && isChangeTypeSupported(changeType) } protected boolean isEntityAspectSupported( - @Nonnull EntitySpec entitySpec, @Nonnull AspectSpec aspectSpec) { - return isEntityAspectSupported(entitySpec.getName(), aspectSpec.getName()); + @Nonnull EntitySpec entitySpec, @Nonnull String aspectName) { + return isEntityAspectSupported(entitySpec.getName(), aspectName); } protected boolean isEntityAspectSupported( diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MCLSideEffect.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MCLSideEffect.java index 57016404648d50..853c2ef5f796c2 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MCLSideEffect.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MCLSideEffect.java @@ -24,7 +24,7 @@ public final Stream apply( @Nonnull Collection batchItems, @Nonnull RetrieverContext retrieverContext) { return applyMCLSideEffect( batchItems.stream() - .filter(item -> shouldApply(item.getChangeType(), item.getUrn(), item.getAspectSpec())) + .filter(item -> shouldApply(item.getChangeType(), item.getUrn(), item.getAspectName())) .collect(Collectors.toList()), retrieverContext); } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MCPSideEffect.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MCPSideEffect.java index 52920d8c6f3966..ce49dd057bc3ed 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MCPSideEffect.java +++ 
b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MCPSideEffect.java @@ -25,7 +25,7 @@ public final Stream apply( Collection changeMCPS, @Nonnull RetrieverContext retrieverContext) { return applyMCPSideEffect( changeMCPS.stream() - .filter(item -> shouldApply(item.getChangeType(), item.getUrn(), item.getAspectSpec())) + .filter(item -> shouldApply(item.getChangeType(), item.getUrn(), item.getAspectName())) .collect(Collectors.toList()), retrieverContext); } @@ -41,7 +41,7 @@ public final Stream postApply( Collection mclItems, @Nonnull RetrieverContext retrieverContext) { return postMCPSideEffect( mclItems.stream() - .filter(item -> shouldApply(item.getChangeType(), item.getUrn(), item.getAspectSpec())) + .filter(item -> shouldApply(item.getChangeType(), item.getUrn(), item.getAspectName())) .collect(Collectors.toList()), retrieverContext); } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MutationHook.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MutationHook.java index c067954912a032..b2fd997d49444d 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MutationHook.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/hooks/MutationHook.java @@ -3,6 +3,7 @@ import com.linkedin.metadata.aspect.ReadItem; import com.linkedin.metadata.aspect.RetrieverContext; import com.linkedin.metadata.aspect.batch.ChangeMCP; +import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.PluginSpec; import com.linkedin.util.Pair; import java.util.Collection; @@ -24,7 +25,7 @@ public final Stream> applyWriteMutation( @Nonnull Collection changeMCPS, @Nonnull RetrieverContext retrieverContext) { return writeMutation( changeMCPS.stream() - .filter(i -> shouldApply(i.getChangeType(), i.getEntitySpec(), i.getAspectSpec())) + .filter(i -> shouldApply(i.getChangeType(), i.getEntitySpec(), i.getAspectName())) 
.collect(Collectors.toList()), retrieverContext); } @@ -34,7 +35,23 @@ public final Stream> applyReadMutation( @Nonnull Collection items, @Nonnull RetrieverContext retrieverContext) { return readMutation( items.stream() - .filter(i -> isEntityAspectSupported(i.getEntitySpec(), i.getAspectSpec())) + .filter(i -> isEntityAspectSupported(i.getEntitySpec(), i.getAspectName())) + .collect(Collectors.toList()), + retrieverContext); + } + + /** + * Apply Proposal mutations prior to validation + * + * @param mcpItems wrapper for MCP + * @param retrieverContext retriever context + * @return stream of mutated Proposal items + */ + public final Stream applyProposalMutation( + @Nonnull Collection mcpItems, @Nonnull RetrieverContext retrieverContext) { + return proposalMutation( + mcpItems.stream() + .filter(i -> shouldApply(i.getChangeType(), i.getEntitySpec(), i.getAspectName())) .collect(Collectors.toList()), retrieverContext); } @@ -48,4 +65,9 @@ protected Stream> writeMutation( @Nonnull Collection changeMCPS, @Nonnull RetrieverContext retrieverContext) { return changeMCPS.stream().map(i -> Pair.of(i, false)); } + + protected Stream proposalMutation( + @Nonnull Collection mcpItems, @Nonnull RetrieverContext retrieverContext) { + return Stream.empty(); + } } diff --git a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/AspectPayloadValidator.java b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/AspectPayloadValidator.java index b39c38c2768a7f..4083329899fee0 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/AspectPayloadValidator.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/aspect/plugins/validation/AspectPayloadValidator.java @@ -22,7 +22,7 @@ public final Stream validateProposed( @Nonnull RetrieverContext retrieverContext) { return validateProposedAspects( mcpItems.stream() - .filter(i -> shouldApply(i.getChangeType(), i.getUrn(), i.getAspectSpec())) + 
.filter(i -> shouldApply(i.getChangeType(), i.getUrn(), i.getAspectName())) .collect(Collectors.toList()), retrieverContext); } @@ -37,7 +37,7 @@ public final Stream validatePreCommit( @Nonnull Collection changeMCPs, @Nonnull RetrieverContext retrieverContext) { return validatePreCommitAspects( changeMCPs.stream() - .filter(i -> shouldApply(i.getChangeType(), i.getUrn(), i.getAspectSpec())) + .filter(i -> shouldApply(i.getChangeType(), i.getUrn(), i.getAspectName())) .collect(Collectors.toList()), retrieverContext); } diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index 6666e335446884..ff29cb5fff47d2 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -21,6 +21,7 @@ dependencies { api project(':metadata-service:services') api project(':metadata-operation-context') + implementation spec.product.pegasus.restliServer implementation spec.product.pegasus.data implementation spec.product.pegasus.generator diff --git a/metadata-io/metadata-io-api/build.gradle b/metadata-io/metadata-io-api/build.gradle index bd79e8cb3ddefb..b8028fad07bb65 100644 --- a/metadata-io/metadata-io-api/build.gradle +++ b/metadata-io/metadata-io-api/build.gradle @@ -8,4 +8,11 @@ dependencies { implementation project(':metadata-utils') compileOnly externalDependency.lombok annotationProcessor externalDependency.lombok + + testImplementation(externalDependency.testng) + testImplementation(externalDependency.mockito) + testImplementation(testFixtures(project(":entity-registry"))) + testImplementation project(':metadata-operation-context') + testImplementation externalDependency.lombok + testAnnotationProcessor externalDependency.lombok } diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java index 0914df744e413a..a23f6ab175046b 100644 --- 
a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImpl.java @@ -8,6 +8,7 @@ import com.linkedin.metadata.aspect.batch.AspectsBatch; import com.linkedin.metadata.aspect.batch.BatchItem; import com.linkedin.metadata.aspect.batch.ChangeMCP; +import com.linkedin.metadata.aspect.batch.MCPItem; import com.linkedin.metadata.aspect.plugins.validation.ValidationExceptionCollection; import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.util.Pair; @@ -18,6 +19,7 @@ import java.util.Objects; import java.util.Set; import java.util.stream.Collectors; +import java.util.stream.Stream; import javax.annotation.Nonnull; import lombok.Builder; import lombok.Getter; @@ -44,9 +46,20 @@ public class AspectsBatchImpl implements AspectsBatch { public Pair>, List> toUpsertBatchItems( final Map> latestAspects) { + // Process proposals to change items + Stream mutatedProposalsStream = + proposedItemsToChangeItemStream( + items.stream() + .filter(item -> item instanceof ProposedItem) + .map(item -> (MCPItem) item) + .collect(Collectors.toList())); + // Regular change items + Stream changeMCPStream = + items.stream().filter(item -> !(item instanceof ProposedItem)); + // Convert patches to upserts if needed LinkedList upsertBatchItems = - items.stream() + Stream.concat(mutatedProposalsStream, changeMCPStream) .map( item -> { final String urnStr = item.getUrn().toString(); @@ -85,6 +98,17 @@ public Pair>, List> toUpsertBatchItems( return Pair.of(newUrnAspectNames, upsertBatchItems); } + private Stream proposedItemsToChangeItemStream(List proposedItems) { + return applyProposalMutationHooks(proposedItems, retrieverContext) + .filter(mcpItem -> mcpItem.getMetadataChangeProposal() != null) + .map( + mcpItem -> + ChangeItemImpl.ChangeItemImplBuilder.build( + mcpItem.getMetadataChangeProposal(), + mcpItem.getAuditStamp(), + 
retrieverContext.getAspectRetriever())); + } + public static class AspectsBatchImplBuilder { /** * Just one aspect record template diff --git a/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java new file mode 100644 index 00000000000000..452ed39ddf3174 --- /dev/null +++ b/metadata-io/metadata-io-api/src/main/java/com/linkedin/metadata/entity/ebean/batch/ProposedItem.java @@ -0,0 +1,80 @@ +package com.linkedin.metadata.entity.ebean.batch; + +import com.linkedin.common.AuditStamp; +import com.linkedin.common.urn.Urn; +import com.linkedin.data.template.RecordTemplate; +import com.linkedin.events.metadata.ChangeType; +import com.linkedin.metadata.aspect.batch.MCPItem; +import com.linkedin.metadata.models.AspectSpec; +import com.linkedin.metadata.models.EntitySpec; +import com.linkedin.metadata.utils.GenericRecordUtils; +import com.linkedin.mxe.MetadataChangeProposal; +import com.linkedin.mxe.SystemMetadata; +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import lombok.Builder; +import lombok.Getter; +import lombok.extern.slf4j.Slf4j; + +/** Represents an unvalidated wrapped MCP */ +@Slf4j +@Getter +@Builder(toBuilder = true) +public class ProposedItem implements MCPItem { + @Nonnull private final MetadataChangeProposal metadataChangeProposal; + @Nonnull private final AuditStamp auditStamp; + // derived + @Nonnull private EntitySpec entitySpec; + @Nullable private AspectSpec aspectSpec; + + @Nonnull + @Override + public String getAspectName() { + if (metadataChangeProposal.getAspectName() != null) { + return metadataChangeProposal.getAspectName(); + } else { + return MCPItem.super.getAspectName(); + } + } + + @Nullable + public AspectSpec getAspectSpec() { + if (aspectSpec != null) { + return aspectSpec; + } + if (entitySpec.getAspectSpecMap().containsKey(getAspectName())) { + return 
entitySpec.getAspectSpecMap().get(getAspectName()); + } + return null; + } + + @Nullable + @Override + public RecordTemplate getRecordTemplate() { + if (getAspectSpec() != null) { + return GenericRecordUtils.deserializeAspect( + getMetadataChangeProposal().getAspect().getValue(), + getMetadataChangeProposal().getAspect().getContentType(), + getAspectSpec()); + } + return null; + } + + @Nonnull + @Override + public Urn getUrn() { + return metadataChangeProposal.getEntityUrn(); + } + + @Nullable + @Override + public SystemMetadata getSystemMetadata() { + return metadataChangeProposal.getSystemMetadata(); + } + + @Nonnull + @Override + public ChangeType getChangeType() { + return metadataChangeProposal.getChangeType(); + } +} diff --git a/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java new file mode 100644 index 00000000000000..d2e7243d045604 --- /dev/null +++ b/metadata-io/metadata-io-api/src/test/java/com/linkedin/metadata/entity/ebean/batch/AspectsBatchImplTest.java @@ -0,0 +1,320 @@ +package com.linkedin.metadata.entity.ebean.batch; + +import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; +import static com.linkedin.metadata.Constants.STATUS_ASPECT_NAME; +import static com.linkedin.metadata.Constants.STRUCTURED_PROPERTIES_ASPECT_NAME; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; + +import com.linkedin.common.Status; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.ByteString; +import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; +import com.linkedin.events.metadata.ChangeType; +import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.GraphRetriever; +import com.linkedin.metadata.aspect.batch.MCPItem; +import 
com.linkedin.metadata.aspect.patch.GenericJsonPatch; +import com.linkedin.metadata.aspect.patch.PatchOperationType; +import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; +import com.linkedin.metadata.aspect.plugins.hooks.MutationHook; +import com.linkedin.metadata.entity.SearchRetriever; +import com.linkedin.metadata.models.registry.ConfigEntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.models.registry.EntityRegistryException; +import com.linkedin.metadata.models.registry.MergedEntityRegistry; +import com.linkedin.metadata.models.registry.SnapshotEntityRegistry; +import com.linkedin.metadata.snapshot.Snapshot; +import com.linkedin.metadata.utils.AuditStampUtils; +import com.linkedin.metadata.utils.GenericRecordUtils; +import com.linkedin.mxe.GenericAspect; +import com.linkedin.mxe.MetadataChangeProposal; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.structured.StructuredProperties; +import com.linkedin.structured.StructuredPropertyValueAssignmentArray; +import com.linkedin.util.Pair; +import io.datahubproject.metadata.context.RetrieverContext; +import java.nio.charset.StandardCharsets; +import java.util.Collection; +import java.util.List; +import java.util.Map; +import java.util.stream.Stream; +import javax.annotation.Nonnull; +import lombok.Getter; +import lombok.Setter; +import lombok.experimental.Accessors; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.BeforeTest; +import org.testng.annotations.Test; + +public class AspectsBatchImplTest { + private EntityRegistry testRegistry; + private AspectRetriever mockAspectRetriever; + private RetrieverContext retrieverContext; + + @BeforeTest + public void beforeTest() throws EntityRegistryException { + PathSpecBasedSchemaAnnotationVisitor.class + .getClassLoader() + .setClassAssertionStatus(PathSpecBasedSchemaAnnotationVisitor.class.getName(), false); + + EntityRegistry snapshotEntityRegistry = new 
SnapshotEntityRegistry(); + EntityRegistry configEntityRegistry = + new ConfigEntityRegistry( + Snapshot.class.getClassLoader().getResourceAsStream("AspectsBatchImplTest.yaml")); + this.testRegistry = + new MergedEntityRegistry(snapshotEntityRegistry).apply(configEntityRegistry); + } + + @BeforeMethod + public void setup() { + this.mockAspectRetriever = mock(AspectRetriever.class); + when(this.mockAspectRetriever.getEntityRegistry()).thenReturn(testRegistry); + this.retrieverContext = + RetrieverContext.builder() + .searchRetriever(mock(SearchRetriever.class)) + .aspectRetriever(mockAspectRetriever) + .graphRetriever(mock(GraphRetriever.class)) + .build(); + } + + @Test + public void toUpsertBatchItemsChangeItemTest() { + List testItems = + List.of( + ChangeItemImpl.builder() + .urn( + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)")) + .changeType(ChangeType.UPSERT) + .aspectName(STATUS_ASPECT_NAME) + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectSpec( + testRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(STATUS_ASPECT_NAME)) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .recordTemplate(new Status().setRemoved(true)) + .build(mockAspectRetriever), + ChangeItemImpl.builder() + .urn( + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)")) + .changeType(ChangeType.UPSERT) + .aspectName(STATUS_ASPECT_NAME) + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectSpec( + testRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(STATUS_ASPECT_NAME)) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .recordTemplate(new Status().setRemoved(false)) + .build(mockAspectRetriever)); + + AspectsBatchImpl testBatch = + AspectsBatchImpl.builder().items(testItems).retrieverContext(retrieverContext).build(); + + assertEquals( + testBatch.toUpsertBatchItems(Map.of()), + Pair.of(Map.of(), testItems), + "Expected noop, pass 
through with no additional MCPs or changes"); + } + + @Test + public void toUpsertBatchItemsPatchItemTest() { + GenericJsonPatch.PatchOp testPatchOp = new GenericJsonPatch.PatchOp(); + testPatchOp.setOp(PatchOperationType.REMOVE.getValue()); + testPatchOp.setPath( + String.format( + "/properties/%s", "urn:li:structuredProperty:io.acryl.privacy.retentionTime")); + + List testItems = + List.of( + PatchItemImpl.builder() + .urn( + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)")) + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectName(STRUCTURED_PROPERTIES_ASPECT_NAME) + .aspectSpec( + testRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(STRUCTURED_PROPERTIES_ASPECT_NAME)) + .patch( + GenericJsonPatch.builder() + .arrayPrimaryKeys(Map.of("properties", List.of("propertyUrn"))) + .patch(List.of(testPatchOp)) + .build() + .getJsonPatch()) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(retrieverContext.getAspectRetriever().getEntityRegistry()), + PatchItemImpl.builder() + .urn( + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)")) + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectName(STRUCTURED_PROPERTIES_ASPECT_NAME) + .aspectSpec( + testRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(STRUCTURED_PROPERTIES_ASPECT_NAME)) + .patch( + GenericJsonPatch.builder() + .arrayPrimaryKeys(Map.of("properties", List.of("propertyUrn"))) + .patch(List.of(testPatchOp)) + .build() + .getJsonPatch()) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(retrieverContext.getAspectRetriever().getEntityRegistry())); + + AspectsBatchImpl testBatch = + AspectsBatchImpl.builder().items(testItems).retrieverContext(retrieverContext).build(); + + assertEquals( + testBatch.toUpsertBatchItems(Map.of()), + Pair.of( + Map.of(), + List.of( + ChangeItemImpl.builder() + .urn( + UrnUtils.getUrn( + 
"urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)")) + .changeType(ChangeType.UPSERT) + .aspectName(STRUCTURED_PROPERTIES_ASPECT_NAME) + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectSpec( + testRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(STRUCTURED_PROPERTIES_ASPECT_NAME)) + .auditStamp(testItems.get(0).getAuditStamp()) + .recordTemplate( + new StructuredProperties() + .setProperties(new StructuredPropertyValueAssignmentArray())) + .systemMetadata(testItems.get(0).getSystemMetadata()) + .build(mockAspectRetriever), + ChangeItemImpl.builder() + .urn( + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)")) + .changeType(ChangeType.UPSERT) + .aspectName(STRUCTURED_PROPERTIES_ASPECT_NAME) + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectSpec( + testRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(STRUCTURED_PROPERTIES_ASPECT_NAME)) + .auditStamp(testItems.get(1).getAuditStamp()) + .recordTemplate( + new StructuredProperties() + .setProperties(new StructuredPropertyValueAssignmentArray())) + .systemMetadata(testItems.get(1).getSystemMetadata()) + .build(mockAspectRetriever))), + "Expected patch items converted to upsert change items"); + } + + @Test + public void toUpsertBatchItemsProposedItemTest() { + List testItems = + List.of( + ProposedItem.builder() + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .metadataChangeProposal( + new MetadataChangeProposal() + .setEntityUrn( + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)")) + .setAspectName("my-custom-aspect") + .setEntityType(DATASET_ENTITY_NAME) + .setChangeType(ChangeType.UPSERT) + .setAspect( + new GenericAspect() + .setContentType("application/json") + .setValue( + ByteString.copyString( + "{\"foo\":\"bar\"}", StandardCharsets.UTF_8))) + .setSystemMetadata(new SystemMetadata())) + 
.auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build(), + ProposedItem.builder() + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .metadataChangeProposal( + new MetadataChangeProposal() + .setEntityUrn( + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)")) + .setAspectName("my-custom-aspect") + .setEntityType(DATASET_ENTITY_NAME) + .setChangeType(ChangeType.UPSERT) + .setAspect( + new GenericAspect() + .setContentType("application/json") + .setValue( + ByteString.copyString( + "{\"foo\":\"bar\"}", StandardCharsets.UTF_8))) + .setSystemMetadata(new SystemMetadata())) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build()); + + AspectsBatchImpl testBatch = + AspectsBatchImpl.builder().items(testItems).retrieverContext(retrieverContext).build(); + + assertEquals( + testBatch.toUpsertBatchItems(Map.of()), + Pair.of( + Map.of(), + List.of( + ChangeItemImpl.builder() + .urn( + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_created,PROD)")) + .changeType(ChangeType.UPSERT) + .aspectName(STATUS_ASPECT_NAME) + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectSpec( + testRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(STATUS_ASPECT_NAME)) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .systemMetadata(testItems.get(0).getSystemMetadata()) + .recordTemplate(new Status().setRemoved(false)) + .build(mockAspectRetriever), + ChangeItemImpl.builder() + .urn( + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)")) + .changeType(ChangeType.UPSERT) + .aspectName(STATUS_ASPECT_NAME) + .entitySpec(testRegistry.getEntitySpec(DATASET_ENTITY_NAME)) + .aspectSpec( + testRegistry + .getEntitySpec(DATASET_ENTITY_NAME) + .getAspectSpec(STATUS_ASPECT_NAME)) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .systemMetadata(testItems.get(1).getSystemMetadata()) + .recordTemplate(new 
Status().setRemoved(false)) + .build(mockAspectRetriever))), + "Mutation to status aspect"); + } + + /** Converts unsupported to status aspect */ + @Getter + @Setter + @Accessors(chain = true) + public static class TestMutator extends MutationHook { + private AspectPluginConfig config; + + @Override + protected Stream proposalMutation( + @Nonnull Collection mcpItems, + @Nonnull com.linkedin.metadata.aspect.RetrieverContext retrieverContext) { + return mcpItems.stream() + .peek( + item -> + item.getMetadataChangeProposal() + .setAspectName(STATUS_ASPECT_NAME) + .setAspect( + GenericRecordUtils.serializeAspect(new Status().setRemoved(false)))); + } + } +} diff --git a/metadata-io/metadata-io-api/src/test/resources/AspectsBatchImplTest.yaml b/metadata-io/metadata-io-api/src/test/resources/AspectsBatchImplTest.yaml new file mode 100644 index 00000000000000..9716b0cab9b2f9 --- /dev/null +++ b/metadata-io/metadata-io-api/src/test/resources/AspectsBatchImplTest.yaml @@ -0,0 +1,19 @@ +entities: + - name: dataset + doc: Datasets represent logical or physical data assets stored or represented in various data platforms. Tables, Views, Streams are all instances of datasets. 
+ category: core + keyAspect: datasetKey + aspects: + - status + - structuredProperties +plugins: + mutationHooks: + - className: 'com.linkedin.metadata.entity.ebean.batch.AspectsBatchImplTest$TestMutator' + packageScan: + - 'com.linkedin.metadata.entity.ebean.batch' + enabled: true + supportedOperations: + - UPSERT + supportedEntityAspectNames: + - entityName: 'dataset' + aspectName: '*' \ No newline at end of file diff --git a/metadata-io/src/main/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutator.java b/metadata-io/src/main/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutator.java new file mode 100644 index 00000000000000..8d6bdffceacb93 --- /dev/null +++ b/metadata-io/src/main/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutator.java @@ -0,0 +1,80 @@ +package com.linkedin.metadata.aspect.hooks; + +import com.datahub.util.exception.ModelConversionException; +import com.linkedin.data.template.RecordTemplate; +import com.linkedin.data.transform.filter.request.MaskTree; +import com.linkedin.metadata.aspect.RetrieverContext; +import com.linkedin.metadata.aspect.batch.MCPItem; +import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; +import com.linkedin.metadata.aspect.plugins.hooks.MutationHook; +import com.linkedin.metadata.entity.validation.ValidationApiUtils; +import com.linkedin.metadata.entity.validation.ValidationException; +import com.linkedin.metadata.models.AspectSpec; +import com.linkedin.metadata.utils.GenericRecordUtils; +import com.linkedin.mxe.GenericAspect; +import com.linkedin.restli.internal.server.util.RestUtils; +import java.util.Collection; +import java.util.stream.Stream; +import javax.annotation.Nonnull; +import lombok.Getter; +import lombok.Setter; +import lombok.experimental.Accessors; +import lombok.extern.slf4j.Slf4j; + +/** This mutator will log and drop unknown aspects. It will also log and drop unknown fields. 
*/ +@Slf4j +@Setter +@Getter +@Accessors(chain = true) +public class IgnoreUnknownMutator extends MutationHook { + @Nonnull private AspectPluginConfig config; + + @Override + protected Stream proposalMutation( + @Nonnull Collection mcpItems, @Nonnull RetrieverContext retrieverContext) { + return mcpItems.stream() + .filter( + item -> { + if (item.getEntitySpec().getAspectSpec(item.getAspectName()) == null) { + log.warn( + "Dropping unknown aspect {} on entity {}", + item.getAspectName(), + item.getEntitySpec().getName()); + return false; + } + if (!"application/json" + .equals(item.getMetadataChangeProposal().getAspect().getContentType())) { + log.warn( + "Dropping unknown content type {} for aspect {} on entity {}", + item.getMetadataChangeProposal().getAspect().getContentType(), + item.getAspectName(), + item.getEntitySpec().getName()); + return false; + } + return true; + }) + .peek( + item -> { + try { + AspectSpec aspectSpec = item.getEntitySpec().getAspectSpec(item.getAspectName()); + GenericAspect aspect = item.getMetadataChangeProposal().getAspect(); + RecordTemplate recordTemplate = + GenericRecordUtils.deserializeAspect( + aspect.getValue(), aspect.getContentType(), aspectSpec); + try { + ValidationApiUtils.validateOrThrow(recordTemplate); + } catch (ValidationException | ModelConversionException e) { + log.warn( + "Failed to validate aspect. 
Coercing aspect {} on entity {}", + item.getAspectName(), + item.getEntitySpec().getName()); + RestUtils.trimRecordTemplate(recordTemplate, new MaskTree(), false); + item.getMetadataChangeProposal() + .setAspect(GenericRecordUtils.serializeAspect(recordTemplate)); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + }); + } +} diff --git a/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java new file mode 100644 index 00000000000000..11a3153abcaeed --- /dev/null +++ b/metadata-io/src/test/java/com/linkedin/metadata/aspect/hooks/IgnoreUnknownMutatorTest.java @@ -0,0 +1,143 @@ +package com.linkedin.metadata.aspect.hooks; + +import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; +import static com.linkedin.metadata.Constants.DATASET_PROPERTIES_ASPECT_NAME; +import static com.linkedin.metadata.Constants.GLOBAL_TAGS_ASPECT_NAME; +import static org.mockito.Mockito.mock; +import static org.testng.Assert.assertEquals; + +import com.linkedin.common.GlobalTags; +import com.linkedin.common.TagAssociation; +import com.linkedin.common.TagAssociationArray; +import com.linkedin.common.urn.TagUrn; +import com.linkedin.common.urn.Urn; +import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.ByteString; +import com.linkedin.data.template.StringMap; +import com.linkedin.dataset.DatasetProperties; +import com.linkedin.events.metadata.ChangeType; +import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.aspect.batch.MCPItem; +import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; +import com.linkedin.metadata.entity.SearchRetriever; +import com.linkedin.metadata.entity.ebean.batch.ProposedItem; +import com.linkedin.metadata.models.registry.EntityRegistry; +import com.linkedin.metadata.utils.AuditStampUtils; +import com.linkedin.mxe.GenericAspect; +import 
com.linkedin.mxe.MetadataChangeProposal; +import com.linkedin.mxe.SystemMetadata; +import com.linkedin.test.metadata.aspect.TestEntityRegistry; +import io.datahubproject.metadata.context.RetrieverContext; +import io.datahubproject.test.metadata.context.TestOperationContexts; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.List; +import java.util.Map; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class IgnoreUnknownMutatorTest { + private static final EntityRegistry TEST_REGISTRY = new TestEntityRegistry(); + private static final AspectPluginConfig TEST_PLUGIN_CONFIG = + AspectPluginConfig.builder() + .className(IgnoreUnknownMutator.class.getName()) + .enabled(true) + .supportedOperations(List.of("UPSERT")) + .supportedEntityAspectNames( + List.of( + AspectPluginConfig.EntityAspectName.builder() + .entityName(DATASET_ENTITY_NAME) + .aspectName("*") + .build())) + .build(); + private static final Urn TEST_DATASET_URN = + UrnUtils.getUrn( + "urn:li:dataset:(urn:li:dataPlatform:postgres,calm-pagoda-323403.jaffle_shop.customers,PROD)"); + private AspectRetriever mockAspectRetriever; + private RetrieverContext retrieverContext; + + @BeforeMethod + public void setup() { + mockAspectRetriever = mock(AspectRetriever.class); + retrieverContext = + RetrieverContext.builder() + .searchRetriever(mock(SearchRetriever.class)) + .aspectRetriever(mockAspectRetriever) + .graphRetriever(TestOperationContexts.emptyGraphRetriever) + .build(); + } + + @Test + public void testUnknownFieldInTagAssociationArray() throws URISyntaxException { + IgnoreUnknownMutator test = new IgnoreUnknownMutator(); + test.setConfig(TEST_PLUGIN_CONFIG); + + List testItems = + List.of( + ProposedItem.builder() + .entitySpec(TEST_REGISTRY.getEntitySpec(DATASET_ENTITY_NAME)) + .metadataChangeProposal( + new MetadataChangeProposal() + .setEntityUrn(TEST_DATASET_URN) + .setAspectName(GLOBAL_TAGS_ASPECT_NAME) + 
.setEntityType(DATASET_ENTITY_NAME) + .setChangeType(ChangeType.UPSERT) + .setAspect( + new GenericAspect() + .setContentType("application/json") + .setValue( + ByteString.copyString( + "{\"tags\":[{\"tag\":\"urn:li:tag:Legacy\",\"foo\":\"bar\"}]}", + StandardCharsets.UTF_8))) + .setSystemMetadata(new SystemMetadata())) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build()); + + List result = test.proposalMutation(testItems, retrieverContext).toList(); + + assertEquals(1, result.size()); + assertEquals( + result.get(0).getAspect(GlobalTags.class), + new GlobalTags() + .setTags( + new TagAssociationArray( + List.of( + new TagAssociation() + .setTag(TagUrn.createFromString("urn:li:tag:Legacy")))))); + } + + @Test + public void testUnknownFieldDatasetProperties() throws URISyntaxException { + IgnoreUnknownMutator test = new IgnoreUnknownMutator(); + test.setConfig(TEST_PLUGIN_CONFIG); + + List testItems = + List.of( + ProposedItem.builder() + .entitySpec(TEST_REGISTRY.getEntitySpec(DATASET_ENTITY_NAME)) + .metadataChangeProposal( + new MetadataChangeProposal() + .setEntityUrn(TEST_DATASET_URN) + .setAspectName(DATASET_PROPERTIES_ASPECT_NAME) + .setEntityType(DATASET_ENTITY_NAME) + .setChangeType(ChangeType.UPSERT) + .setAspect( + new GenericAspect() + .setContentType("application/json") + .setValue( + ByteString.copyString( + "{\"foo\":\"bar\",\"customProperties\":{\"prop2\":\"pikachu\",\"prop1\":\"fakeprop\"}}", + StandardCharsets.UTF_8))) + .setSystemMetadata(new SystemMetadata())) + .auditStamp(AuditStampUtils.createDefaultAuditStamp()) + .build()); + + List result = test.proposalMutation(testItems, retrieverContext).toList(); + + assertEquals(1, result.size()); + assertEquals( + result.get(0).getAspect(DatasetProperties.class), + new DatasetProperties() + .setCustomProperties(new StringMap(Map.of("prop1", "fakeprop", "prop2", "pikachu")))); + } +} diff --git 
a/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MaeConsumerApplication.java b/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MaeConsumerApplication.java index 9a4c01dabf9a77..f6533a6ac1d8a9 100644 --- a/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MaeConsumerApplication.java +++ b/metadata-jobs/mae-consumer-job/src/main/java/com/linkedin/metadata/kafka/MaeConsumerApplication.java @@ -34,6 +34,7 @@ "com.linkedin.gms.factory.context", "com.linkedin.gms.factory.timeseries", "com.linkedin.gms.factory.assertion", + "com.linkedin.gms.factory.plugins" }, excludeFilters = { @ComponentScan.Filter( diff --git a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java index 2666f58de862ef..f6f71a12a6951f 100644 --- a/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java +++ b/metadata-jobs/mae-consumer/src/test/java/com/linkedin/metadata/kafka/hook/spring/MCLSpringCommonTestConfiguration.java @@ -6,6 +6,7 @@ import com.datahub.authentication.Authentication; import com.datahub.metadata.ingestion.IngestionScheduler; import com.linkedin.entity.client.SystemEntityClient; +import com.linkedin.gms.factory.plugins.SpringStandardPluginConfiguration; import com.linkedin.metadata.boot.kafka.DataHubUpgradeKafkaListener; import com.linkedin.metadata.graph.elastic.ElasticSearchGraphService; import com.linkedin.metadata.models.registry.EntityRegistry; @@ -85,4 +86,6 @@ public OperationContext operationContext( indexConvention, mock(RetrieverContext.class)); } + + @MockBean SpringStandardPluginConfiguration springStandardPluginConfiguration; } diff --git a/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java 
b/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java index af3caecba865c1..4ea5e6ea34d5b3 100644 --- a/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java +++ b/metadata-jobs/mce-consumer-job/src/main/java/com/linkedin/metadata/kafka/MceConsumerApplication.java @@ -33,7 +33,8 @@ "com.linkedin.gms.factory.form", "com.linkedin.metadata.dao.producer", "io.datahubproject.metadata.jobs.common.health.kafka", - "com.linkedin.gms.factory.context" + "com.linkedin.gms.factory.context", + "com.linkedin.gms.factory.plugins" }, excludeFilters = { @ComponentScan.Filter( diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index c8344b7de1e127..6006ca179d162c 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -665,3 +665,9 @@ plugins: aspectName: 'schemaMetadata' - entityName: '*' aspectName: 'editableSchemaMetadata' + - className: 'com.linkedin.metadata.aspect.plugins.hooks.MutationHook' + enabled: true + spring: + enabled: true + packageScan: + - com.linkedin.gms.factory.plugins \ No newline at end of file diff --git a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java index dcea185fcbc7ca..1eee0498f112a6 100644 --- a/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java +++ b/metadata-operation-context/src/main/java/io/datahubproject/metadata/context/RequestContext.java @@ -35,6 +35,7 @@ public class RequestContext implements ContextInterface { @Nonnull private final String requestID; @Nonnull private final String userAgent; + @Builder.Default private boolean validated = true; public RequestContext( @Nonnull String actorUrn, diff --git 
a/metadata-service/configuration/src/main/resources/application.yaml b/metadata-service/configuration/src/main/resources/application.yaml index 599f7e7be344fd..1d5b7c7904f978 100644 --- a/metadata-service/configuration/src/main/resources/application.yaml +++ b/metadata-service/configuration/src/main/resources/application.yaml @@ -466,6 +466,8 @@ businessAttribute: keepAliveTime: ${BUSINESS_ATTRIBUTE_PROPAGATION_CONCURRENCY_KEEP_ALIVE:60} # Number of seconds to keep inactive threads alive metadataChangeProposal: + validation: + ignoreUnknown: ${MCP_VALIDATION_IGNORE_UNKNOWN:true} throttle: updateIntervalMs: ${MCP_THROTTLE_UPDATE_INTERVAL_MS:60000} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/ConfigEntityRegistryFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/ConfigEntityRegistryFactory.java index f1518f9c8f9d74..9f4dfb86c0fcd4 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/ConfigEntityRegistryFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/entityregistry/ConfigEntityRegistryFactory.java @@ -1,6 +1,7 @@ package com.linkedin.gms.factory.entityregistry; import com.datahub.plugins.metadata.aspect.SpringPluginFactory; +import com.linkedin.gms.factory.plugins.SpringStandardPluginConfiguration; import com.linkedin.metadata.aspect.plugins.PluginFactory; import com.linkedin.metadata.aspect.plugins.config.PluginConfiguration; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; @@ -29,7 +30,9 @@ public class ConfigEntityRegistryFactory { @Bean(name = "configEntityRegistry") @Nonnull - protected ConfigEntityRegistry getInstance() throws IOException, EntityRegistryException { + protected ConfigEntityRegistry getInstance( + SpringStandardPluginConfiguration springStandardPluginConfiguration) + throws IOException, EntityRegistryException { BiFunction, PluginFactory> pluginFactoryProvider 
= (config, loaders) -> new SpringPluginFactory(applicationContext, config, loaders); if (entityRegistryConfigPath != null) { diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/plugins/SpringStandardPluginConfiguration.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/plugins/SpringStandardPluginConfiguration.java new file mode 100644 index 00000000000000..fa4f520dc88c7c --- /dev/null +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/plugins/SpringStandardPluginConfiguration.java @@ -0,0 +1,33 @@ +package com.linkedin.gms.factory.plugins; + +import com.linkedin.metadata.aspect.hooks.IgnoreUnknownMutator; +import com.linkedin.metadata.aspect.plugins.config.AspectPluginConfig; +import com.linkedin.metadata.aspect.plugins.hooks.MutationHook; +import java.util.List; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; + +@Configuration +public class SpringStandardPluginConfiguration { + + @Value("${metadataChangeProposal.validation.ignoreUnknown}") + private boolean ignoreUnknownEnabled; + + @Bean + public MutationHook ignoreUnknownMutator() { + return new IgnoreUnknownMutator() + .setConfig( + AspectPluginConfig.builder() + .className(IgnoreUnknownMutator.class.getName()) + .enabled(ignoreUnknownEnabled) + .supportedOperations(List.of("CREATE", "CREATE_ENTITY", "UPSERT")) + .supportedEntityAspectNames( + List.of( + AspectPluginConfig.EntityAspectName.builder() + .entityName("*") + .aspectName("*") + .build())) + .build()); + } +} diff --git a/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java b/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java index 043b0016abaaae..f7e911c2629088 100644 --- 
a/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java +++ b/metadata-service/plugin/src/main/java/com/datahub/plugins/metadata/aspect/SpringPluginFactory.java @@ -78,6 +78,15 @@ private static Stream filterSpringConfigs( config -> config.getSpring() != null && config.getSpring().isEnabled()); } + @Nonnull + @Override + public List getClassLoaders() { + if (!super.getClassLoaders().isEmpty()) { + return super.getClassLoaders(); + } + return List.of(SpringPluginFactory.class.getClassLoader()); + } + /** * Override to inject classes from Spring * @@ -137,7 +146,8 @@ protected List build( log.warn( "Failed to load class {} from loader {}", config.getClassName(), - classLoader.getName()); + classLoader.getName(), + e); } } diff --git a/metadata-service/war/src/main/java/com/linkedin/gms/CommonApplicationConfig.java b/metadata-service/war/src/main/java/com/linkedin/gms/CommonApplicationConfig.java index c44cb4eaa1ac3b..bc623c3cc983c2 100644 --- a/metadata-service/war/src/main/java/com/linkedin/gms/CommonApplicationConfig.java +++ b/metadata-service/war/src/main/java/com/linkedin/gms/CommonApplicationConfig.java @@ -37,6 +37,7 @@ "com.linkedin.gms.factory.search", "com.linkedin.gms.factory.secret", "com.linkedin.gms.factory.timeseries", + "com.linkedin.gms.factory.plugins" }) @PropertySource(value = "classpath:/application.yaml", factory = YamlPropertySourceFactory.class) @Configuration From 4b83adfa9f33d50c92376fda12f47fb1574ba80f Mon Sep 17 00:00:00 2001 From: Patrick Franco Braz Date: Tue, 16 Jul 2024 19:50:54 -0300 Subject: [PATCH 17/18] fix(ingest/bigquery): changes helper function to decode unicode scape sequences (#10845) --- .../source/bigquery_v2/bigquery_helper.py | 19 +++++++++++--- .../unit/test_bigqueryv2_usage_source.py | 26 +++++++++++++++++++ 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py 
b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py index bbdf32da13621d..507e1d917d2066 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_helper.py @@ -10,14 +10,27 @@ def unquote_and_decode_unicode_escape_seq( """ If string starts and ends with a quote, unquote it and decode Unicode escape sequences """ + unicode_seq_pattern = re.compile(r"\\(u|U)[0-9a-fA-F]{4}") trailing_quote = trailing_quote if trailing_quote else leading_quote if string.startswith(leading_quote) and string.endswith(trailing_quote): string = string[1:-1] - cleaned_string = string.encode().decode("unicode-escape") - - return cleaned_string + # Decode Unicode escape sequences. This avoids issues with encoding + # This process does not handle unicode from "\U00010000" to "\U0010FFFF" + while unicode_seq_pattern.search(string): + # Get the first Unicode escape sequence. + # mypy: unicode_seq_pattern.search(string) is not None because of the while loop + unicode_seq = unicode_seq_pattern.search(string).group(0) # type: ignore + # Replace the Unicode escape sequence with the decoded character + try: + string = string.replace( + unicode_seq, unicode_seq.encode("utf-8").decode("unicode-escape") + ) + except UnicodeDecodeError: + # Skip decoding if it is not possible to decode the Unicode escape sequence + break # avoid infinite loop + return string def parse_labels(labels_str: str) -> Dict[str, str]: diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 8a3fa5ca46ea4a..21787af1b0cb9a 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -212,3 +212,29 @@ def test_unquote_and_decode_unicode_escape_seq(): expected_output = "No escape sequences here" result = 
unquote_and_decode_unicode_escape_seq(input_string) assert result == expected_output + + # Test with invalid Unicode escape sequences + input_string = '"No escape \\u123 sequences here"' + expected_output = "No escape \\u123 sequences here" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a string that has multiple Unicode escape sequences + input_string = '"Hello \\u003cWorld\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e \\u003cAgain\\u003e"' + expected_output = "Hello <World> <Again> <Again> <Again>" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with a "\u" prefix that is not followed by hexadecimal digits + input_string = '"Hello \\utest"' + expected_output = "Hello \\utest" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output + + # Test with special characters + input_string = ( + '"Hello \\u003cWorld\\u003e \\u003cçãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?\\u003e"' + ) + expected_output = "Hello <World> <çãâÁÁà|{}()[].,/;\\+=--_*&%$#@!?>" + result = unquote_and_decode_unicode_escape_seq(input_string) + assert result == expected_output From 298c299cf1ec4713031c5295f3da4af06e023836 Mon Sep 17 00:00:00 2001 From: pie1nthesky <39328908+pie1nthesky@users.noreply.github.com> Date: Wed, 17 Jul 2024 02:06:42 +0300 Subject: [PATCH 18/18] feat(ingest/postgres): fetch table sizes for profile (#10864) --- .../src/datahub/ingestion/source/sql/postgres.py | 16 ++++++++++++++++ .../postgres_all_db_mces_with_db_golden.json | 3 ++- .../postgres/postgres_mces_with_db_golden.json | 3 ++- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py index 0589a5e39d68e3..12c98ef11a654d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py @@ -276,3 +276,19 @@ def get_identifier( return f"{self.config.database}.{regular}" current_database = self.get_db_name(inspector) return f"{current_database}.{regular}" + + def add_profile_metadata(self, inspector: Inspector) -> None: + try: + with inspector.engine.connect() as conn: + for row in conn.execute( + """SELECT table_catalog, table_schema, table_name, pg_table_size('"' || table_catalog || '"."' || table_schema || '"."' || table_name || '"') AS table_size FROM information_schema.TABLES""" + ): + self.profile_metadata_info.dataset_name_to_storage_bytes[ + self.get_identifier( + schema=row.table_schema, + entity=row.table_name, + inspector=inspector, + ) + ] = row.table_size + except Exception as e: + logger.error(f"failed to fetch profile metadata: {e}") diff --git a/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json b/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json index b9b2a3b2141a8c..f35ff9fdb9d153 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json @@ -832,7 +832,8 @@ { "fieldPath": "metadata_json" } - ] + ], + "sizeInBytes": 16384 } }, "systemMetadata": { diff --git a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json index 832b46e096ae00..f47789fc470cd8 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json @@ -600,7 +600,8 @@ }, "rowCount": 2, "columnCount": 9, - "fieldProfiles": [] + "fieldProfiles": [], + "sizeInBytes": 16384 } }, "systemMetadata": {