From 0667470b9ebfa075f50df9157c0c6bb53dccef30 Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Thu, 1 Aug 2024 00:09:06 +0530 Subject: [PATCH 01/16] fix(ingestion/lookml): emit dummy sql condition for lookml custom condition tag (#11008) Co-authored-by: Harshal Sheth --- .../source/looker/looker_liquid_tag.py | 17 +- .../source/looker/looker_template_language.py | 29 +- .../source/looker/lookml_concept_context.py | 41 +-- .../ingestion/source/looker/lookml_config.py | 6 +- .../ingestion/source/looker/lookml_source.py | 3 +- .../ingestion/source/looker/view_upstream.py | 71 +++-- .../integration/lookml/expected_output.json | 2 +- .../lookml/lookml_mces_api_bigquery.json | 2 +- .../lookml/lookml_mces_api_hive2.json | 2 +- .../lookml/lookml_mces_badsql_parser.json | 90 ++++++ .../lookml/lookml_mces_offline.json | 90 ++++++ ...lookml_mces_offline_platform_instance.json | 90 ++++++ .../lookml_mces_with_external_urls.json | 90 ++++++ .../lookml/refinements_ingestion_golden.json | 2 +- .../data.model.lkml | 4 + .../employee_salary_rating.view.lkml | 50 ++++ .../vv_lineage_liquid_template_golden.json | 282 +++++++++++++++++- 17 files changed, 805 insertions(+), 66 deletions(-) create mode 100644 metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_salary_rating.view.lkml diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_liquid_tag.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_liquid_tag.py index 35231d273fbba4..7d4ebf00cc06ef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_liquid_tag.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_liquid_tag.py @@ -1,5 +1,5 @@ from functools import lru_cache -from typing import ClassVar, Optional, TextIO, cast +from typing import ClassVar, Optional, TextIO from liquid import Environment from liquid.ast import Node @@ -25,18 +25,9 @@ def __init__(self, tok: Token, sql_or_lookml_reference: str, filter_name: str): self.filter_name = filter_name def render_to_output(self, context: Context, buffer: TextIO) -> Optional[bool]: - filter_value: Optional[str] = cast( - str, context.globals.get(self.filter_name) - ) # to silent lint - - if filter_value is None: - raise CustomTagException( - f'filter {self.filter_name} value is not provided for "condition" tag' - ) - - filter_value = filter_value.strip() - - buffer.write(f"{self.sql_or_lookml_reference}='{filter_value}'") + # This implementation will make sure that sql parse work correctly if looker condition tag + # is used in lookml sql field + buffer.write(f"{self.sql_or_lookml_reference}='dummy_value'") return True diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py index 919d9232a18c5c..2c523fcd98d08c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_template_language.py @@ -9,6 +9,7 @@ CustomTagException, create_template, ) +from datahub.ingestion.source.looker.lookml_config import DERIVED_VIEW_PATTERN from datahub.ingestion.source.looker.str_functions import ( remove_extra_spaces_and_newlines, ) @@ -94,6 +95,24 @@ def resolve_liquid_variable(text: str, liquid_variable: Dict[Any, Any]) -> str: return text +def _complete_incomplete_sql(raw_view: dict, sql: str) -> str: + + # Looker supports sql fragments that omit the SELECT and FROM parts of the query + # Add those in if we detect that it is missing + sql_query: str = sql + + if not re.search(r"SELECT\s", sql_query, flags=re.I): + # add a SELECT clause at the beginning + sql_query = f"SELECT {sql}" + + if not re.search(r"FROM\s", sql_query, flags=re.I): + # add a FROM clause at the end + sql_query = f"{sql_query} FROM {raw_view['name']}" + + # Drop ${ and } + return re.sub(DERIVED_VIEW_PATTERN, r"\1", sql_query) + + def resolve_liquid_variable_in_view_dict( raw_view: dict, liquid_variable: Dict[Any, Any] ) -> None: @@ -102,14 +121,18 @@ def resolve_liquid_variable_in_view_dict( for view in raw_view["views"]: if "sql_table_name" in view: - view["sql_table_name"] = resolve_liquid_variable( + view["datahub_transformed_sql_table_name"] = resolve_liquid_variable( text=remove_extra_spaces_and_newlines(view["sql_table_name"]), liquid_variable=liquid_variable, - ) + ) # keeping original sql_table_name as is to avoid any visualization issue later if "derived_table" in view and "sql" in view["derived_table"]: # In sql we don't need to remove the extra spaces as sql parser takes care of extra spaces and \n # while generating URN from sql - view["derived_table"]["sql"] = resolve_liquid_variable( + view["derived_table"]["datahub_transformed_sql"] = resolve_liquid_variable( text=view["derived_table"]["sql"], liquid_variable=liquid_variable + ) # keeping original sql as is, so that on UI sql will be shown same is it is visible on looker portal + + view["derived_table"]["datahub_transformed_sql"] = _complete_incomplete_sql( + raw_view=view, sql=view["derived_table"]["datahub_transformed_sql"] ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py index e528e578dcf9fa..a83aa2638ec964 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_concept_context.py @@ -266,15 +266,25 @@ def sql_table_name(self) -> str: sql_table_name: Optional[str] = self._get_sql_table_name_field() # if sql_table_name field is not set then the table name is equal to view-name if sql_table_name is None: - return self.raw_view[NAME].lower() + sql_table_name = self.raw_view[NAME].lower() + + return sql_table_name + + def datahub_transformed_sql_table_name(self) -> str: + table_name: Optional[str] = self.raw_view.get( + "datahub_transformed_sql_table_name" + ) + + if not table_name: + table_name = self.sql_table_name() # sql_table_name is in the format "${view-name}.SQL_TABLE_NAME" # remove extra characters if self._is_dot_sql_table_name_present(): - sql_table_name = re.sub(DERIVED_VIEW_PATTERN, r"\1", sql_table_name) + table_name = re.sub(DERIVED_VIEW_PATTERN, r"\1", table_name) # Some sql_table_name fields contain quotes like: optimizely."group", just remove the quotes - return sql_table_name.replace('"', "").replace("`", "").lower() + return table_name.replace('"', "").replace("`", "").lower() def derived_table(self) -> Dict[Any, Any]: """ @@ -296,30 +306,21 @@ def explore_source(self) -> Dict[Any, Any]: return derived_table["explore_source"] - def sql(self, transformed: bool = True) -> str: + def sql(self) -> str: """ This function should only be called if is_sql_based_derived_case return true """ derived_table = self.derived_table() - # Looker supports sql fragments that omit the SELECT and FROM parts of the query - # Add those in if we detect that it is missing - sql_query: str = derived_table["sql"] - - if transformed: # update the original sql attribute only if transformed is true - if not re.search(r"SELECT\s", sql_query, flags=re.I): - # add a SELECT clause at the beginning - sql_query = f"SELECT {sql_query}" + return derived_table["sql"] - if not re.search(r"FROM\s", sql_query, flags=re.I): - # add a FROM clause at the end - sql_query = f"{sql_query} FROM {self.name()}" - # Get the list of tables in the query - - # Drop ${ and } - sql_query = re.sub(DERIVED_VIEW_PATTERN, r"\1", sql_query) + def datahub_transformed_sql(self) -> str: + """ + This function should only be called if is_sql_based_derived_case return true + """ + derived_table = self.derived_table() - return sql_query + return derived_table["datahub_transformed_sql"] def name(self) -> str: return self.raw_view[NAME] diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py index aa5719547c03ed..f4fb1316b16a20 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py @@ -11,8 +11,10 @@ from datahub.configuration.git import GitInfo from datahub.configuration.source_common import EnvConfigMixin from datahub.configuration.validate_field_rename import pydantic_renamed_field -from datahub.ingestion.source.looker.looker_config import LookerCommonConfig -from datahub.ingestion.source.looker.looker_connection import LookerConnectionDefinition +from datahub.ingestion.source.looker.looker_config import ( + LookerCommonConfig, + LookerConnectionDefinition, +) from datahub.ingestion.source.looker.looker_lib_wrapper import ( LookerAPI, LookerAPIConfig, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index 223d168dbe033a..d77e65ac733232 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -201,8 +201,7 @@ def from_looker_dict( view_logic = view_context.view_file.raw_file_content[:max_file_snippet_length] if view_context.is_sql_based_derived_case(): - view_logic = view_context.sql(transformed=False) - # Parse SQL to extract dependencies. + view_logic = view_context.sql() view_details = ViewProperties( materialized=False, viewLogic=view_logic, diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py index 390e71ef9d4bd8..98646e19a7014b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/view_upstream.py @@ -206,6 +206,7 @@ class AbstractViewUpstream(ABC): view_context: LookerViewContext looker_view_id_cache: LookerViewIdCache config: LookMLSourceConfig + reporter: LookMLSourceReport ctx: PipelineContext def __init__( @@ -213,11 +214,13 @@ def __init__( view_context: LookerViewContext, looker_view_id_cache: LookerViewIdCache, config: LookMLSourceConfig, + reporter: LookMLSourceReport, ctx: PipelineContext, ): self.view_context = view_context self.looker_view_id_cache = looker_view_id_cache self.config = config + self.reporter = reporter self.ctx = ctx @abstractmethod @@ -244,9 +247,10 @@ def __init__( view_context: LookerViewContext, looker_view_id_cache: LookerViewIdCache, config: LookMLSourceConfig, + reporter: LookMLSourceReport, ctx: PipelineContext, ): - super().__init__(view_context, looker_view_id_cache, config, ctx) + super().__init__(view_context, looker_view_id_cache, config, reporter, ctx) # These are the function where we need to catch the response once calculated self._get_spr = lru_cache(maxsize=1)(self.__get_spr) self._get_upstream_dataset_urn = lru_cache(maxsize=1)( @@ -259,7 +263,7 @@ def __get_spr(self) -> Optional[SqlParsingResult]: return None spr = create_lineage_sql_parsed_result( - query=self.view_context.sql(), + query=self.view_context.datahub_transformed_sql(), default_schema=self.view_context.view_connection.default_schema, default_db=self.view_context.view_connection.default_db, platform=self.view_context.view_connection.platform, @@ -267,17 +271,6 @@ def __get_spr(self) -> Optional[SqlParsingResult]: env=self.view_context.view_connection.platform_env or self.config.env, graph=self.ctx.graph, ) - - if ( - spr.debug_info.table_error is not None - or spr.debug_info.column_error is not None - ): - logging.debug( - f"Failed to parsed the sql query. table_error={spr.debug_info.table_error} and " - f"column_error={spr.debug_info.column_error}" - ) - return None - return spr def __get_upstream_dataset_urn(self) -> List[Urn]: @@ -286,6 +279,15 @@ def __get_upstream_dataset_urn(self) -> List[Urn]: if sql_parsing_result is None: return [] + if sql_parsing_result.debug_info.table_error is not None: + self.reporter.report_warning( + title="Table Level Lineage Missing", + message="Error in parsing derived sql", + context=f"View-name: {self.view_context.name()}", + exc=sql_parsing_result.debug_info.table_error, + ) + return [] + upstream_dataset_urns: List[str] = [ _drop_hive_dot(urn) for urn in sql_parsing_result.in_tables ] @@ -306,6 +308,15 @@ def create_fields(self) -> List[ViewField]: if spr is None: return [] + if spr.debug_info.column_error is not None: + self.reporter.report_warning( + title="Column Level Lineage Missing", + message="Error in parsing derived sql for CLL", + context=f"View-name: {self.view_context.name()}", + exc=spr.debug_info.column_error, + ) + return [] + fields: List[ViewField] = [] column_lineages: List[ColumnLineageInfo] = ( @@ -336,6 +347,15 @@ def get_upstream_column_ref( if sql_parsing_result is None: return [] + if sql_parsing_result.debug_info.column_error is not None: + self.reporter.report_warning( + title="Column Level Lineage Missing", + message="Error in parsing derived sql for CLL", + context=f"View-name: {self.view_context.name()}. " + f"Error: {sql_parsing_result.debug_info.column_error}", + ) + return [] + upstreams_column_refs: List[ColumnRef] = [] if sql_parsing_result.column_lineage: for cll in sql_parsing_result.column_lineage: @@ -384,9 +404,11 @@ def __init__( view_context: LookerViewContext, looker_view_id_cache: LookerViewIdCache, config: LookMLSourceConfig, + reporter: LookMLSourceReport, ctx: PipelineContext, ): - super().__init__(view_context, looker_view_id_cache, config, ctx) + super().__init__(view_context, looker_view_id_cache, config, reporter, ctx) + self._get_upstream_dataset_urn = lru_cache(maxsize=1)( self.__get_upstream_dataset_urn ) @@ -402,7 +424,7 @@ def __get_upstream_dataset_urn(self) -> List[str]: base_folder_path=self.view_context.base_folder_path, ) - # Current view will always be present in cache. The assert will silence the lint + # Current view will always be present in cache. assert will silence the lint assert current_view_id # We're creating a "LookerExplore" just to use the urn generator. @@ -467,9 +489,10 @@ def __init__( view_context: LookerViewContext, looker_view_id_cache: LookerViewIdCache, config: LookMLSourceConfig, + reporter: LookMLSourceReport, ctx: PipelineContext, ): - super().__init__(view_context, looker_view_id_cache, config, ctx) + super().__init__(view_context, looker_view_id_cache, config, reporter, ctx) self.upstream_dataset_urn = None self._get_upstream_dataset_urn = lru_cache(maxsize=1)( @@ -478,9 +501,9 @@ def __init__( def __get_upstream_dataset_urn(self) -> Urn: # In regular case view's upstream dataset is either same as view-name or mentioned in "sql_table_name" field - # view_context.sql_table_name() handle this condition to return dataset name + # view_context.datahub_transformed_sql_table_name() handle this condition to return dataset name qualified_table_name: str = _generate_fully_qualified_name( - sql_table_name=self.view_context.sql_table_name(), + sql_table_name=self.view_context.datahub_transformed_sql_table_name(), connection_def=self.view_context.view_connection, reporter=self.view_context.reporter, ) @@ -522,9 +545,10 @@ def __init__( view_context: LookerViewContext, looker_view_id_cache: LookerViewIdCache, config: LookMLSourceConfig, + reporter: LookMLSourceReport, ctx: PipelineContext, ): - super().__init__(view_context, looker_view_id_cache, config, ctx) + super().__init__(view_context, looker_view_id_cache, config, reporter, ctx) self.upstream_dataset_urn = [] self._get_upstream_dataset_urn = lru_cache(maxsize=1)( @@ -532,10 +556,10 @@ def __init__( ) def __get_upstream_dataset_urn(self) -> List[Urn]: - # In this case view_context.sql_table_name() refers to derived view name + # In this case view_context.datahub_transformed_sql_table_name() refers to derived view name looker_view_id = get_derived_looker_view_id( qualified_table_name=_generate_fully_qualified_name( - self.view_context.sql_table_name(), + self.view_context.datahub_transformed_sql_table_name(), self.view_context.view_connection, self.view_context.reporter, ), @@ -591,6 +615,7 @@ def create_view_upstream( return RegularViewUpstream( view_context=view_context, config=config, + reporter=reporter, ctx=ctx, looker_view_id_cache=looker_view_id_cache, ) @@ -599,6 +624,7 @@ def create_view_upstream( return DotSqlTableNameViewUpstream( view_context=view_context, config=config, + reporter=reporter, ctx=ctx, looker_view_id_cache=looker_view_id_cache, ) @@ -610,6 +636,7 @@ def create_view_upstream( return SqlBasedDerivedViewUpstream( view_context=view_context, config=config, + reporter=reporter, ctx=ctx, looker_view_id_cache=looker_view_id_cache, ) @@ -618,6 +645,7 @@ def create_view_upstream( return NativeDerivedViewUpstream( view_context=view_context, config=config, + reporter=reporter, ctx=ctx, looker_view_id_cache=looker_view_id_cache, ) @@ -631,6 +659,7 @@ def create_view_upstream( return EmptyImplementation( view_context=view_context, config=config, + reporter=reporter, ctx=ctx, looker_view_id_cache=looker_view_id_cache, ) diff --git a/metadata-ingestion/tests/integration/lookml/expected_output.json b/metadata-ingestion/tests/integration/lookml/expected_output.json index d870c6dee40655..f42c600281ccb3 100644 --- a/metadata-ingestion/tests/integration/lookml/expected_output.json +++ b/metadata-ingestion/tests/integration/lookml/expected_output.json @@ -1632,7 +1632,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n order.region='ap-south-1'\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", "viewLanguage": "sql" } }, diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json index 8813ea532fa2b5..5f9b99ebe30623 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json @@ -1632,7 +1632,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n order.region='ap-south-1'\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", "viewLanguage": "sql" } }, diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json index 4bc1a0f2f7da58..1b95959f0ba1d2 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json @@ -1632,7 +1632,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n order.region='ap-south-1'\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", "viewLanguage": "sql" } }, diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json index 3fd37c47221858..fd479a2baa7226 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json @@ -1675,6 +1675,96 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),customer_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),sale_price)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),lifetime_spend)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "customer_facts", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lifetime_spend", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json index 3fd37c47221858..fd479a2baa7226 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json @@ -1675,6 +1675,96 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),customer_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),sale_price)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),lifetime_spend)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "customer_facts", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lifetime_spend", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json index bb8a379fdde224..053e90d473c1ba 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json @@ -1675,6 +1675,96 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.order,DEV)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.order,DEV),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),customer_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,warehouse.default_db.default_schema.order,DEV),sale_price)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),lifetime_spend)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "customer_facts", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lifetime_spend", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json index b8a2bcc020c34d..44dd72e8fc41be 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json @@ -1683,6 +1683,96 @@ "removed": false } }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),customer_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),customer_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,default_db.default_schema.order,PROD),sale_price)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.customer_facts,PROD),lifetime_spend)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "customer_facts", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "customer_id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lifetime_spend", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NullType": {} + } + }, + "nativeDataType": "unknown", + "recursive": false, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { diff --git a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json index 7265ee3c6c62b9..7c2f92ac1e028c 100644 --- a/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json +++ b/metadata-ingestion/tests/integration/lookml/refinements_ingestion_golden.json @@ -1656,7 +1656,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n order.region='ap-south-1'\n GROUP BY 1", + "viewLogic": "SELECT\n customer_id,\n SUM(sale_price) AS lifetime_spend\n FROM\n order\n WHERE\n {% condition order_region %} order.region {% endcondition %}\n GROUP BY 1", "viewLanguage": "sql" } }, diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml index ea55512c5ca06e..6eb92d749c9f7f 100644 --- a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/data.model.lkml @@ -5,6 +5,7 @@ include: "employee_income_source.view.lkml" include: "employee_total_income.view.lkml" include: "top_10_employee_income_source.view.lkml" include: "employee_tax_report.view.lkml" +include: "employee_salary_rating.view.lkml" explore: activity_logs { } @@ -19,4 +20,7 @@ explore: top_10_employee_income_source { } explore: employee_tax_report { +} + +explore: employee_salary_rating { } \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_salary_rating.view.lkml b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_salary_rating.view.lkml new file mode 100644 index 00000000000000..3a00099e7998e9 --- /dev/null +++ b/metadata-ingestion/tests/integration/lookml/vv-lineage-and-liquid-templates/employee_salary_rating.view.lkml @@ -0,0 +1,50 @@ +view: employee_salary_rating { + derived_table: { + sql: SELECT + employee_id, + employee_name, + {% if dw_eff_dt_date._is_selected or finance_dw_eff_dt_date._is_selected %} + prod_core.data.r_metric_summary_v2 + {% elsif dw_eff_dt_week._is_selected or finance_dw_eff_dt_week._is_selected %} + prod_core.data.r_metric_summary_v3 + {% else %} + 'default_table' as source + {% endif %}, + employee_income + FROM source_table + WHERE + {% condition source_region %} source_table.region {% endcondition %} AND + {% if rating_window._is_filtered %} + {% condition rating_window %} DATE (rating_created) {% endcondition %} + {% endif %} + ;; + } + + filter: rating_window { + description: "Date window in which to look for rating" + default_value: "90 days ago for 90 days" + datatype: date + type: date + } + + dimension: id { + type: number + sql: ${TABLE}.employee_id;; + } + + dimension: name { + type: string + sql: ${TABLE}.employee_name;; + } + + dimension: source { + type: string + sql: ${TABLE}.source ;; + } + + dimension: income { + type: number + sql: ${TABLE}.employee_income ;; + } + +} \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json b/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json index 75cd50c5c6059e..d12ced5e425066 100644 --- a/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json +++ b/metadata-ingestion/tests/integration/lookml/vv_lineage_liquid_template_golden.json @@ -302,7 +302,7 @@ "aspect": { "json": { "materialized": false, - "viewLogic": "SELECT\n employee_id,\n employee_name,\n \n prod_core.data.r_metric_summary_v2\n ,\n employee_income\n FROM source_table\n WHERE\n source_table.region='ap-south-1'", + "viewLogic": "SELECT\n employee_id,\n employee_name,\n {% if dw_eff_dt_date._is_selected or finance_dw_eff_dt_date._is_selected %}\n prod_core.data.r_metric_summary_v2\n {% elsif dw_eff_dt_week._is_selected or finance_dw_eff_dt_week._is_selected %}\n prod_core.data.r_metric_summary_v3\n {% else %}\n 'default_table' as source\n {% endif %},\n employee_income\n FROM source_table\n WHERE\n {% condition source_region %} source_table.region {% endcondition %}", "viewLanguage": "sql" } }, @@ -1300,6 +1300,286 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_salary_rating,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_salary_rating,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT\n employee_id,\n employee_name,\n {% if dw_eff_dt_date._is_selected or finance_dw_eff_dt_date._is_selected %}\n prod_core.data.r_metric_summary_v2\n {% elsif dw_eff_dt_week._is_selected or finance_dw_eff_dt_week._is_selected %}\n prod_core.data.r_metric_summary_v3\n {% else %}\n 'default_table' as source\n {% endif %},\n employee_income\n FROM source_table\n WHERE\n {% condition source_region %} source_table.region {% endcondition %} AND\n {% if rating_window._is_filtered %}\n {% condition rating_window %} DATE (rating_created) {% endcondition %}\n {% endif %}", + "viewLanguage": "sql" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_salary_rating,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_salary_rating,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.BrowsePaths": { + "paths": [ + "/Develop/lkml_samples/" + ] + } + }, + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.UpstreamLineage": { + "upstreams": [ + { + "auditStamp": { + "time": 1586847600000, + "actor": "urn:li:corpuser:datahub" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD),employee_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_salary_rating,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD),employee_name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_salary_rating,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD),source)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_salary_rating,PROD),source)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,source_table,PROD),employee_income)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_salary_rating,PROD),income)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "employee_salary_rating", + "platform": "urn:li:dataPlatform:looker", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.OtherSchema": { + "rawSchema": "" + } + }, + "fields": [ + { + "fieldPath": "id", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "source", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + }, + { + "fieldPath": "income", + "nullable": false, + "description": "", + "label": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "number", + "recursive": false, + "globalTags": { + "tags": [ + { + "tag": "urn:li:tag:Dimension" + } + ] + }, + "isPartOfKey": false + } + ], + "primaryKeys": [] + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "looker.file.path": "employee_salary_rating.view.lkml", + "looker.model": "data" + }, + "name": "employee_salary_rating", + "tags": [] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:looker,lkml_samples.view.employee_salary_rating,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "Develop" + }, + { + "id": "urn:li:container:78f22c19304954b15e8adb1d9809975e", + "urn": "urn:li:container:78f22c19304954b15e8adb1d9809975e" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "lookml-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "tag", "entityUrn": "urn:li:tag:Dimension", From dffdef2eaabac2241adc1da45bc071c0b4b06cbd Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Thu, 1 Aug 2024 00:10:09 +0530 Subject: [PATCH 02/16] fix(ingestion/powerbi): fix issue with broken report lineage (#10910) --- .../api/incremental_lineage_helper.py | 45 + .../ingestion/source/powerbi/config.py | 5 + .../ingestion/source/powerbi/powerbi.py | 9 +- .../src/datahub/specific/chart.py | 108 ++ .../src/datahub/specific/dashboard.py | 122 ++ .../golden_test_admin_access_not_allowed.json | 39 + .../powerbi/golden_test_admin_only.json | 78 ++ .../integration/powerbi/golden_test_cll.json | 39 + .../powerbi/golden_test_container.json | 117 ++ .../golden_test_disabled_ownership.json | 39 + .../powerbi/golden_test_endorsement.json | 39 + .../powerbi/golden_test_ingest.json | 39 + .../golden_test_ingest_patch_disabled.json | 1153 +++++++++++++++++ .../powerbi/golden_test_lineage.json | 39 + .../golden_test_lower_case_urn_ingest.json | 39 + ..._config_and_modified_since_admin_only.json | 78 ++ .../golden_test_platform_instance_ingest.json | 39 + .../powerbi/golden_test_report.json | 93 ++ .../golden_test_scan_all_workspaces.json | 63 + ...lden_test_server_to_platform_instance.json | 39 + .../tests/integration/powerbi/test_powerbi.py | 46 + .../test_incremental_lineage_helper.py | 40 + 22 files changed, 2302 insertions(+), 6 deletions(-) create mode 100644 metadata-ingestion/tests/integration/powerbi/golden_test_ingest_patch_disabled.json create mode 100644 metadata-ingestion/tests/unit/utilities/test_incremental_lineage_helper.py diff --git a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py index 29e1f63dd452ef..78a091f1ffe689 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py +++ b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py @@ -1,3 +1,4 @@ +import logging from typing import Iterable, Optional from pydantic.fields import Field @@ -18,6 +19,8 @@ from datahub.specific.dashboard import DashboardPatchBuilder from datahub.specific.dataset import DatasetPatchBuilder +logger = logging.getLogger(__name__) + def convert_upstream_lineage_to_patch( urn: str, @@ -48,6 +51,20 @@ def convert_chart_info_to_patch( for inputEdge in aspect.inputEdges: patch_builder.add_input_edge(inputEdge) + patch_builder.set_chart_url(aspect.chartUrl).set_external_url( + aspect.externalUrl + ).set_type(aspect.type).set_title(aspect.title).set_access( + aspect.access + ).set_last_modified( + aspect.lastModified + ).set_last_refreshed( + aspect.lastRefreshed + ).set_description( + aspect.description + ).add_inputs( + aspect.inputs + ) + values = patch_builder.build() if values: mcp = next(iter(values)) @@ -76,8 +93,36 @@ def convert_dashboard_info_to_patch( for chartEdge in aspect.chartEdges: patch_builder.add_chart_edge(chartEdge) + if aspect.title: + patch_builder.set_title(aspect.title) + + if aspect.description: + patch_builder.set_description(aspect.description) + + if aspect.charts: + patch_builder.add_charts(aspect.charts) + + if aspect.dashboardUrl: + patch_builder.set_dashboard_url(aspect.dashboardUrl) + + if aspect.datasets: + patch_builder.add_datasets(aspect.datasets) + + if aspect.access: + patch_builder.set_access(aspect.access) + + if aspect.lastRefreshed: + patch_builder.set_last_refreshed(aspect.lastRefreshed) + + if aspect.lastModified: + patch_builder.set_last_modified(last_modified=aspect.lastModified) + values = patch_builder.build() + if values: + logger.debug( + f"Generating patch DashboardInfo MetadataWorkUnit for dashboard {aspect.title}" + ) mcp = next(iter(values)) return MetadataWorkUnit( id=MetadataWorkUnit.generate_workunit_id(mcp), mcp_raw=mcp diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index bd80433bc2e6ca..967dd5d81112d5 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -440,6 +440,11 @@ class PowerBiDashboardSourceConfig( ) profiling: PowerBiProfilingConfig = PowerBiProfilingConfig() + patch_metadata: bool = pydantic.Field( + default=True, + description="Patch dashboard metadata", + ) + @root_validator(skip_on_failure=True) def validate_extract_column_level_lineage(cls, values: Dict) -> Dict: flags = [ diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index de4eaf6b64434f..73f242a06b1d67 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -1197,8 +1197,7 @@ def report_to_datahub_work_units( ) -> Iterable[MetadataWorkUnit]: mcps: List[MetadataChangeProposalWrapper] = [] - logger.debug(f"Converting dashboard={report.name} to datahub dashboard") - + logger.debug(f"Converting report={report.name} to datahub dashboard") # Convert user to CorpUser user_mcps = self.to_datahub_users(report.users) # Convert pages to charts. A report has single dataset and same dataset used in pages to create visualization @@ -1215,9 +1214,7 @@ def report_to_datahub_work_units( mcps.extend(chart_mcps) mcps.extend(report_mcps) - # Convert MCP to work_units - work_units = map(self._to_work_unit, mcps) - return work_units + return map(self._to_work_unit, mcps) @platform_name("PowerBI") @@ -1385,7 +1382,7 @@ def _get_dashboard_patch_work_unit( DashboardInfoClass ] = work_unit.get_aspect_of_type(DashboardInfoClass) - if dashboard_info_aspect: + if dashboard_info_aspect and self.source_config.patch_metadata: return convert_dashboard_info_to_patch( work_unit.get_urn(), dashboard_info_aspect, diff --git a/metadata-ingestion/src/datahub/specific/chart.py b/metadata-ingestion/src/datahub/specific/chart.py index 51994ad1d063eb..cc68168b68db7e 100644 --- a/metadata-ingestion/src/datahub/specific/chart.py +++ b/metadata-ingestion/src/datahub/specific/chart.py @@ -3,8 +3,11 @@ from datahub.emitter.mcp_patch_builder import MetadataPatchProposal from datahub.metadata.schema_classes import ( + AccessLevelClass, AuditStampClass, + ChangeAuditStampsClass, ChartInfoClass as ChartInfo, + ChartTypeClass, EdgeClass as Edge, GlobalTagsClass as GlobalTags, GlossaryTermAssociationClass as Term, @@ -311,3 +314,108 @@ def remove_custom_property(self, key: str) -> "ChartPatchBuilder": """ self.custom_properties_patch_helper.remove_property(key) return self + + def set_title(self, title: str) -> "ChartPatchBuilder": + assert title, "ChartInfo title should not be None" + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/title", + value=title, + ) + + return self + + def set_description(self, description: str) -> "ChartPatchBuilder": + assert description, "DashboardInfo description should not be None" + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/description", + value=description, + ) + + return self + + def set_last_refreshed(self, last_refreshed: Optional[int]) -> "ChartPatchBuilder": + if last_refreshed: + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/lastRefreshed", + value=last_refreshed, + ) + + return self + + def set_last_modified( + self, last_modified: "ChangeAuditStampsClass" + ) -> "ChartPatchBuilder": + if last_modified: + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/lastModified", + value=last_modified, + ) + + return self + + def set_external_url(self, external_url: Optional[str]) -> "ChartPatchBuilder": + if external_url: + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/externalUrl", + value=external_url, + ) + return self + + def set_chart_url(self, dashboard_url: Optional[str]) -> "ChartPatchBuilder": + if dashboard_url: + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/chartUrl", + value=dashboard_url, + ) + + return self + + def set_type( + self, type: Union[None, Union[str, "ChartTypeClass"]] = None + ) -> "ChartPatchBuilder": + if type: + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/type", + value=type, + ) + + return self + + def set_access( + self, access: Union[None, Union[str, "AccessLevelClass"]] = None + ) -> "ChartPatchBuilder": + if access: + self._add_patch( + ChartInfo.ASPECT_NAME, + "add", + path="/access", + value=access, + ) + + return self + + def add_inputs(self, input_urns: Optional[List[str]]) -> "ChartPatchBuilder": + if input_urns: + for urn in input_urns: + self._add_patch( + aspect_name=ChartInfo.ASPECT_NAME, + op="add", + path=f"/inputs/{urn}", + value=urn, + ) + + return self diff --git a/metadata-ingestion/src/datahub/specific/dashboard.py b/metadata-ingestion/src/datahub/specific/dashboard.py index e6d911b5986550..8228dbc011db2f 100644 --- a/metadata-ingestion/src/datahub/specific/dashboard.py +++ b/metadata-ingestion/src/datahub/specific/dashboard.py @@ -3,7 +3,9 @@ from datahub.emitter.mcp_patch_builder import MetadataPatchProposal from datahub.metadata.schema_classes import ( + AccessLevelClass, AuditStampClass, + ChangeAuditStampsClass, DashboardInfoClass as DashboardInfo, EdgeClass as Edge, GlobalTagsClass as GlobalTags, @@ -405,3 +407,123 @@ def remove_custom_property(self, key: str) -> "DashboardPatchBuilder": """ self.custom_properties_patch_helper.remove_property(key) return self + + def set_title(self, title: str) -> "DashboardPatchBuilder": + assert title, "DashboardInfo title should not be None" + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/title", + value=title, + ) + + return self + + def set_description(self, description: str) -> "DashboardPatchBuilder": + assert description, "DashboardInfo description should not be None" + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/description", + value=description, + ) + + return self + + def add_custom_properties( + self, custom_properties: Optional[Dict[str, str]] = None + ) -> "DashboardPatchBuilder": + + if custom_properties: + for key, value in custom_properties.items(): + self.custom_properties_patch_helper.add_property(key, value) + + return self + + def set_external_url(self, external_url: Optional[str]) -> "DashboardPatchBuilder": + if external_url: + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/externalUrl", + value=external_url, + ) + return self + + def add_charts(self, chart_urns: Optional[List[str]]) -> "DashboardPatchBuilder": + if chart_urns: + for urn in chart_urns: + self._add_patch( + aspect_name=DashboardInfo.ASPECT_NAME, + op="add", + path=f"/charts/{urn}", + value=urn, + ) + + return self + + def add_datasets( + self, dataset_urns: Optional[List[str]] + ) -> "DashboardPatchBuilder": + if dataset_urns: + for urn in dataset_urns: + self._add_patch( + aspect_name=DashboardInfo.ASPECT_NAME, + op="add", + path=f"/datasets/{urn}", + value=urn, + ) + + return self + + def set_dashboard_url( + self, dashboard_url: Optional[str] + ) -> "DashboardPatchBuilder": + if dashboard_url: + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/dashboardUrl", + value=dashboard_url, + ) + + return self + + def set_access( + self, access: Union[None, Union[str, "AccessLevelClass"]] = None + ) -> "DashboardPatchBuilder": + if access: + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/access", + value=access, + ) + + return self + + def set_last_refreshed( + self, last_refreshed: Optional[int] + ) -> "DashboardPatchBuilder": + if last_refreshed: + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/lastRefreshed", + value=last_refreshed, + ) + + return self + + def set_last_modified( + self, last_modified: "ChangeAuditStampsClass" + ) -> "DashboardPatchBuilder": + if last_modified: + self._add_patch( + DashboardInfo.ASPECT_NAME, + "add", + path="/lastModified", + value=last_modified, + ) + + return self diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_admin_access_not_allowed.json b/metadata-ingestion/tests/integration/powerbi/golden_test_admin_access_not_allowed.json index 8ec431b6fe9f1b..049008bddc58ed 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_admin_access_not_allowed.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_admin_access_not_allowed.json @@ -312,6 +312,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_admin_only.json b/metadata-ingestion/tests/integration/powerbi/golden_test_admin_only.json index 29e9ccebf067ec..fa4bcb8abaa94d 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_admin_only.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_admin_only.json @@ -1226,6 +1226,40 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, @@ -1951,6 +1985,50 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,reports.5b218778-e7a5-4d73-8187-f10824047715)", + "changeType": "PATCH", + "aspectName": "dashboardInfo", + "aspect": { + "json": [ + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://app.powerbi.com/groups/f089354e-8366-4e18-aea3-4cb4a3a50b48/reports/5b218778-e7a5-4d73-8187-f10824047715" + }, + { + "op": "add", + "path": "/description", + "value": "Acryl sales marketing report" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "op": "add", + "path": "/title", + "value": "SalesMarketing" + } + ] + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,reports.5b218778-e7a5-4d73-8187-f10824047715)", diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json b/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json index 937cad0b9ec17e..60b36897ed2e42 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json @@ -1276,6 +1276,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_container.json b/metadata-ingestion/tests/integration/powerbi/golden_test_container.json index 501ec284097b39..b43e4a6c2c1c2d 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_container.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_container.json @@ -1721,6 +1721,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, @@ -2903,6 +2942,60 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,reports.5b218778-e7a5-4d73-8187-f10824047715)", + "changeType": "PATCH", + "aspectName": "dashboardInfo", + "aspect": { + "json": [ + { + "op": "add", + "path": "/title", + "value": "SalesMarketing" + }, + { + "op": "add", + "path": "/description", + "value": "Acryl sales marketing report" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,pages.5b218778-e7a5-4d73-8187-f10824047715.ReportSection)", + "value": "urn:li:chart:(powerbi,pages.5b218778-e7a5-4d73-8187-f10824047715.ReportSection)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,pages.5b218778-e7a5-4d73-8187-f10824047715.ReportSection1)", + "value": "urn:li:chart:(powerbi,pages.5b218778-e7a5-4d73-8187-f10824047715.ReportSection1)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://app.powerbi.com/groups/f089354e-8366-4e18-aea3-4cb4a3a50b48/reports/5b218778-e7a5-4d73-8187-f10824047715" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:33c7cab6ea0e58930cd6f943d0a4111e", @@ -3065,6 +3158,30 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C22-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard2" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json index 74779ac7a0577e..c5414444cc35b8 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_disabled_ownership.json @@ -986,6 +986,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_endorsement.json b/metadata-ingestion/tests/integration/powerbi/golden_test_endorsement.json index 442dfd5c8c0825..e1ddbfb901badd 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_endorsement.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_endorsement.json @@ -1158,6 +1158,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json index 2c4ff6ee851f4a..6f899a7fa11b72 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest.json @@ -1018,6 +1018,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_ingest_patch_disabled.json b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest_patch_disabled.json new file mode 100644 index 00000000000000..efbd9abfdb9118 --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_ingest_patch_disabled.json @@ -0,0 +1,1153 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "dummy", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "public issue_history", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Snowflake.Databases(\"hp123rt5.ap-southeast-2.fakecomputing.com\",\"PBI_TEST_WAREHOUSE_PROD\",[Role=\"PBI_TEST_MEMBER\"]),\n PBI_TEST_Database = Source{[Name=\"PBI_TEST\",Kind=\"Database\"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name=\"TESTTABLE\",Kind=\"Table\"]}[Data]\nin\n TESTTABLE_Table", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "SNOWFLAKE_TESTTABLE", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "snowflake native-query", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = GoogleBigQuery.Database([BillingProject = #\"Parameter - Source\"]),\n#\"gcp-project\" = Source{[Name=#\"Parameter - Source\"]}[Data],\nuniversal_Schema = #\"gcp-project\"{[Name=\"universal\",Kind=\"Schema\"]}[Data],\nD_WH_DATE_Table = universal_Schema{[Name=\"D_WH_DATE\",Kind=\"Table\"]}[Data],\n#\"Filtered Rows\" = Table.SelectRows(D_WH_DATE_Table, each [D_DATE] > #datetime(2019, 9, 10, 0, 0, 0)),\n#\"Filtered Rows1\" = Table.SelectRows(#\"Filtered Rows\", each DateTime.IsInPreviousNHours([D_DATE], 87600))\n in \n#\"Filtered Rows1\"", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "big-query-with-parameter", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Value.NativeQuery(Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]){[Name=\"GSL_TEST_DB\"]}[Data], \"select A.name from GSL_TEST_DB.PUBLIC.SALES_ANALYST as A inner join GSL_TEST_DB.PUBLIC.SALES_FORECAST as B on A.name = B.name where startswith(A.name, 'mo')\", null, [EnableFolding=true])\nin\n Source", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "snowflake native-query-with-join", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Oracle.Database(\"localhost:1521/salesdb.domain.com\", [HierarchicalNavigation=true]), HR = Source{[Schema=\"HR\"]}[Data], EMPLOYEES1 = HR{[Name=\"EMPLOYEES\"]}[Data] \n in EMPLOYEES1", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "job-history", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = PostgreSQL.Database(\"localhost\" , \"mics\" ),\n public_order_date = Source{[Schema=\"public\",Item=\"order_date\"]}[Data] \n in \n public_order_date", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "postgres_test_table", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Sql.Database(\"localhost\", \"library\"),\n dbo_book_issue = Source{[Schema=\"dbo\",Item=\"book_issue\"]}[Data]\n in dbo_book_issue", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details", + "name": "dbo_book_issue", + "description": "hr pbi test description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"mth_date\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([mth_date])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details", + "name": "ms_sql_native_table", + "description": "hr pbi test description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "json": { + "username": "User1@foo.com" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "json": { + "username": "User2@foo.com" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "json": { + "customProperties": { + "createdFrom": "Dataset", + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445", + "datasetWebUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details" + }, + "title": "test_tile", + "description": "test_tile", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Tile" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "chartId": "charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "demo-workspace" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "json": { + "customProperties": { + "createdFrom": "Dataset", + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed", + "datasetWebUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details" + }, + "title": "yearly_sales", + "description": "yearly_sales", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Tile" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "chartId": "charts.23212598-23b5-4980-87cc-5fc0ecd84385" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "demo-workspace" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "json": { + "customProperties": { + "chartCount": "2", + "workspaceName": "demo-workspace", + "workspaceId": "64ED5CAD-7C10-4684-8180-826122881108" + }, + "title": "test_dashboard", + "description": "Description of test dashboard", + "charts": [ + "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + ], + "datasets": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "dashboardUrl": "https://localhost/dashboards/web/1" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "dashboardId": "powerbi.linkedin.com/dashboards/7D668CAD-7FFC-4505-9215-655BCA5BEBAE" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:users.User1@foo.com", + "type": "NONE" + }, + { + "owner": "urn:li:corpuser:users.User2@foo.com", + "type": "NONE" + } + ], + "ownerTypes": {}, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "demo-workspace" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json index 0aea8514559ecb..9a09cb4fec64d0 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_lineage.json @@ -1201,6 +1201,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json index 22a00236af8c61..d80aa02c4cb123 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_lower_case_urn_ingest.json @@ -1018,6 +1018,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json b/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json index 11a7fed6030cec..66e87952bf1416 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_most_config_and_modified_since_admin_only.json @@ -1028,6 +1028,40 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, @@ -1257,6 +1291,50 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,reports.5b218778-e7a5-4d73-8187-f10824047715)", + "changeType": "PATCH", + "aspectName": "dashboardInfo", + "aspect": { + "json": [ + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://app.powerbi.com/groups/f089354e-8366-4e18-aea3-4cb4a3a50b48/reports/5b218778-e7a5-4d73-8187-f10824047715" + }, + { + "op": "add", + "path": "/description", + "value": "Acryl sales marketing report" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + { + "op": "add", + "path": "/title", + "value": "SalesMarketing" + } + ] + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "container", "entityUrn": "urn:li:container:977b804137a1d2bf897ff1bbf440a1cc", diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_platform_instance_ingest.json b/metadata-ingestion/tests/integration/powerbi/golden_test_platform_instance_ingest.json index cf5d4df460e231..ea1ee0df4b1057 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_platform_instance_ingest.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_platform_instance_ingest.json @@ -1026,6 +1026,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,aws-ap-south-1.charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,aws-ap-south-1.charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,aws-ap-south-1.charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,aws-ap-south-1.charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json index cce9e3f8755ba5..094869bfd24f1d 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_report.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_report.json @@ -1018,6 +1018,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, @@ -1935,6 +1974,60 @@ "lastRunId": "no-run-id-provided" } }, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,reports.5b218778-e7a5-4d73-8187-f10824047715)", + "changeType": "PATCH", + "aspectName": "dashboardInfo", + "aspect": { + "json": [ + { + "op": "add", + "path": "/title", + "value": "SalesMarketing" + }, + { + "op": "add", + "path": "/description", + "value": "Acryl sales marketing report" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,pages.5b218778-e7a5-4d73-8187-f10824047715.ReportSection)", + "value": "urn:li:chart:(powerbi,pages.5b218778-e7a5-4d73-8187-f10824047715.ReportSection)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,pages.5b218778-e7a5-4d73-8187-f10824047715.ReportSection1)", + "value": "urn:li:chart:(powerbi,pages.5b218778-e7a5-4d73-8187-f10824047715.ReportSection1)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://app.powerbi.com/groups/f089354e-8366-4e18-aea3-4cb4a3a50b48/reports/5b218778-e7a5-4d73-8187-f10824047715" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + } + ] + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, { "entityType": "dashboard", "entityUrn": "urn:li:dashboard:(powerbi,reports.5b218778-e7a5-4d73-8187-f10824047715)", diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json b/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json index 5e244e0e0f93f7..dcaa518a3c3237 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_scan_all_workspaces.json @@ -986,6 +986,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, @@ -1087,6 +1126,30 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C22-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard2" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_server_to_platform_instance.json b/metadata-ingestion/tests/integration/powerbi/golden_test_server_to_platform_instance.json index 8fd8989b81122b..bc5e844f679c71 100644 --- a/metadata-ingestion/tests/integration/powerbi/golden_test_server_to_platform_instance.json +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_server_to_platform_instance.json @@ -1226,6 +1226,45 @@ "op": "add", "path": "/customProperties/workspaceId", "value": "64ED5CAD-7C10-4684-8180-826122881108" + }, + { + "op": "add", + "path": "/title", + "value": "test_dashboard" + }, + { + "op": "add", + "path": "/description", + "value": "Description of test dashboard" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "value": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)" + }, + { + "op": "add", + "path": "/charts/urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "value": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + }, + { + "op": "add", + "path": "/dashboardUrl", + "value": "https://localhost/dashboards/web/1" + }, + { + "op": "add", + "path": "/lastModified", + "value": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } } ] }, diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 06cc40fe7b24c1..6a95ec2c1dda42 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -683,6 +683,52 @@ def test_powerbi_ingest( ) +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +@pytest.mark.integration +def test_powerbi_ingest_patch_disabled( + mock_msal: MagicMock, + pytestconfig: pytest.Config, + tmp_path: str, + mock_time: datetime.datetime, + requests_mock: Any, +) -> None: + enable_logging() + + test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" + + register_mock_api(request_mock=requests_mock) + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_source_config(), + "patch_metadata": False, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_mces.json", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "golden_test_ingest_patch_disabled.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=f"{tmp_path}/powerbi_mces.json", + golden_path=f"{test_resources_dir}/{golden_file}", + ) + + @freeze_time(FROZEN_TIME) @mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) @pytest.mark.integration diff --git a/metadata-ingestion/tests/unit/utilities/test_incremental_lineage_helper.py b/metadata-ingestion/tests/unit/utilities/test_incremental_lineage_helper.py new file mode 100644 index 00000000000000..1db4e48fce6088 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_incremental_lineage_helper.py @@ -0,0 +1,40 @@ +from typing import Optional + +from datahub.ingestion.api.incremental_lineage_helper import convert_chart_info_to_patch +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.schema_classes import ( + ChangeAuditStampsClass, + ChartInfoClass, + MetadataChangeProposalClass, +) + + +def test_convert_chart_info_to_patch(): + chart_info_class: ChartInfoClass = ChartInfoClass( + title="foo", + description="Checking patch", + inputs=[ + "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.analytics.pet_details,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.customers,PROD)", + ], + lastModified=ChangeAuditStampsClass(), + ) + + mw: Optional[MetadataWorkUnit] = convert_chart_info_to_patch( + urn="urn:li:chart:(looker,dashboard_elements.1)", + aspect=chart_info_class, + system_metadata=None, + ) + + assert mw + + assert mw.id == "urn:li:chart:(looker,dashboard_elements.1)-chartInfo" + + assert isinstance(mw.metadata, MetadataChangeProposalClass) + + assert mw.metadata.aspect + + assert ( + mw.metadata.aspect.value + == b'[{"op": "add", "path": "/title", "value": "foo"}, {"op": "add", "path": "/lastModified", "value": {"created": {"time": 0, "actor": "urn:li:corpuser:unknown"}, "lastModified": {"time": 0, "actor": "urn:li:corpuser:unknown"}}}, {"op": "add", "path": "/description", "value": "Checking patch"}, {"op": "add", "path": "/inputs/urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.analytics.pet_details,PROD)", "value": "urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.analytics.pet_details,PROD)"}, {"op": "add", "path": "/inputs/urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.customers,PROD)", "value": "urn:li:dataset:(urn:li:dataPlatform:dbt,calm-pagoda-323403.jaffle_shop.customers,PROD)"}]' + ) From e83550ba352037350c69082a843bf78578afd59c Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 31 Jul 2024 12:20:48 -0700 Subject: [PATCH 03/16] feat(ingest/tableau): add retry on timeout (#10995) --- metadata-ingestion/setup.py | 2 +- .../src/datahub/ingestion/api/source.py | 5 +- .../state/stale_entity_removal_handler.py | 2 +- .../src/datahub/ingestion/source/tableau.py | 48 ++++++++++++++++++- 4 files changed, 53 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 8a593b23d1f9cc..20a43a94f6bdaf 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -454,7 +454,7 @@ }, # FIXME: I don't think tableau uses sqllineage anymore so we should be able # to remove that dependency. - "tableau": {"tableauserverclient>=0.17.0"} | sqllineage_lib | sqlglot_lib, + "tableau": {"tableauserverclient>=0.24.0"} | sqllineage_lib | sqlglot_lib, "teradata": sql_common | usage_common | sqlglot_lib diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 788bec97a64884..a4de8b382430c7 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -137,7 +137,10 @@ def report_log( ) # Add the simple exception details to the context. - context = f"{context}: {exc}" + if context: + context = f"{context} {type(exc)}: {exc}" + else: + context = f"{type(exc)}: {exc}" elif log: logger.log(level=level.value, msg=log_content, stacklevel=stacklevel) diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py b/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py index ee1ccdff781dc1..7ba06fe24155d1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stale_entity_removal_handler.py @@ -298,7 +298,7 @@ def gen_removed_entity_workunits(self) -> Iterable[MetadataWorkUnit]: if copy_previous_state_and_fail: logger.info( - f"Copying urns from last state (size {last_checkpoint_state.urns}) to current state (size {cur_checkpoint_state.urns}) " + f"Copying urns from last state (size {len(last_checkpoint_state.urns)}) to current state (size {len(cur_checkpoint_state.urns)}) " "to ensure stale entities from previous runs are deleted on the next successful run." ) for urn in last_checkpoint_state.urns: diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 1655724f2d402d..9cde3b1f8d3a07 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -1,5 +1,6 @@ import logging import re +import time from collections import OrderedDict from dataclasses import dataclass from datetime import datetime @@ -13,6 +14,7 @@ Optional, Set, Tuple, + Type, Union, cast, ) @@ -158,6 +160,21 @@ from datahub.utilities import config_clean from datahub.utilities.urns.dataset_urn import DatasetUrn +try: + # On earlier versions of the tableauserverclient, the NonXMLResponseError + # was thrown when reauthentication was needed. We'll keep both exceptions + # around for now, but can remove this in the future. + from tableauserverclient.server.endpoint.exceptions import ( # type: ignore + NotSignedInError, + ) + + REAUTHENTICATE_ERRORS: Tuple[Type[Exception], ...] = ( + NotSignedInError, + NonXMLResponseError, + ) +except ImportError: + REAUTHENTICATE_ERRORS = (NonXMLResponseError,) + logger: logging.Logger = logging.getLogger(__name__) # Replace / with | @@ -965,7 +982,7 @@ def get_connection_object_page( query_data = query_metadata( self.server, query, connection_type, count, offset, query_filter ) - except NonXMLResponseError: + except REAUTHENTICATE_ERRORS: if not retry_on_auth_error: raise @@ -1038,6 +1055,35 @@ def get_connection_object_page( ) else: + # As of Tableau Server 2024.2, the metadata API sporadically returns a 30 second + # timeout error. It doesn't reliably happen, so retrying a couple times makes sense. + if all( + error.get("message") + == "Execution canceled because timeout of 30000 millis was reached" + for error in errors + ): + # If it was only a timeout error, we can retry. + if retries_remaining <= 0: + raise + + # This is a pretty dumb backoff mechanism, but it's good enough for now. + backoff_time = min( + (self.config.max_retries - retries_remaining + 1) ** 2, 60 + ) + logger.info( + f"Query {connection_type} received a 30 second timeout error - will retry in {backoff_time} seconds. " + f"Retries remaining: {retries_remaining}" + ) + time.sleep(backoff_time) + return self.get_connection_object_page( + query, + connection_type, + query_filter, + count, + offset, + retry_on_auth_error=False, + retries_remaining=retries_remaining - 1, + ) raise RuntimeError(f"Query {connection_type} error: {errors}") connection_object = query_data.get(c.DATA, {}).get(connection_type, {}) From fc7b6853b93649ffbb6d2df38f0ac1348220a2ec Mon Sep 17 00:00:00 2001 From: "jaegwon.seo" <162448493+wornjs@users.noreply.github.com> Date: Thu, 1 Aug 2024 05:05:55 +0900 Subject: [PATCH 04/16] change generate kafka connect properties from env (#10545) Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com> --- docker/kafka-setup/env_to_properties.py | 24 +++++++++++++++ docker/kafka-setup/kafka-setup.sh | 40 +------------------------ 2 files changed, 25 insertions(+), 39 deletions(-) create mode 100644 docker/kafka-setup/env_to_properties.py diff --git a/docker/kafka-setup/env_to_properties.py b/docker/kafka-setup/env_to_properties.py new file mode 100644 index 00000000000000..8d8b8c3cc7b59f --- /dev/null +++ b/docker/kafka-setup/env_to_properties.py @@ -0,0 +1,24 @@ +import os +import re +import sys + + +def env_to_properties(env_prefix: str, properties_file: str): + pattern = re.compile('(?<=[^_])_(?=[^_])') + props = {} + + for (env_name, val) in os.environ.items(): + if env_name.startswith(env_prefix): + raw_name = env_name[len(env_prefix):].lower() + prop_dot = '.'.join(pattern.split(raw_name)) + props[prop_dot] = val + + with open(properties_file, 'a') as f: + for k, v in props.items(): + f.writelines(f'{k}={v}\n') + + +if __name__ == '__main__': + env_prefix = sys.argv[1] + properties_file = sys.argv[2] + env_to_properties(env_prefix, properties_file) diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh index 439ffb4d4d8295..392cca94666419 100755 --- a/docker/kafka-setup/kafka-setup.sh +++ b/docker/kafka-setup/kafka-setup.sh @@ -10,46 +10,8 @@ fi . kafka-config.sh echo "bootstrap.servers=$KAFKA_BOOTSTRAP_SERVER" > $CONNECTION_PROPERTIES_PATH -echo "security.protocol=$KAFKA_PROPERTIES_SECURITY_PROTOCOL" >> $CONNECTION_PROPERTIES_PATH -## Add support for SASL_PLAINTEXT -if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SASL_PLAINTEXT" ]]; then - echo "sasl.mechanism=$KAFKA_PROPERTIES_SASL_MECHANISM" >> $CONNECTION_PROPERTIES_PATH - echo "sasl.jaas.config=$KAFKA_PROPERTIES_SASL_JAAS_CONFIG" >> $CONNECTION_PROPERTIES_PATH - echo "sasl.kerberos.service.name=$KAFKA_PROPERTIES_SASL_KERBEROS_SERVICE_NAME" >> $CONNECTION_PROPERTIES_PATH -fi - -## Add support for SASL_SSL -if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SASL_SSL" ]]; then - echo "sasl.jaas.config=$KAFKA_PROPERTIES_SASL_JAAS_CONFIG" >> $CONNECTION_PROPERTIES_PATH - echo "sasl.mechanism=$KAFKA_PROPERTIES_SASL_MECHANISM" >> $CONNECTION_PROPERTIES_PATH -fi - -if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SSL" ]]; then - if [[ -n $KAFKA_PROPERTIES_SSL_KEYSTORE_LOCATION ]]; then - echo "ssl.keystore.location=$KAFKA_PROPERTIES_SSL_KEYSTORE_LOCATION" >> $CONNECTION_PROPERTIES_PATH - echo "ssl.keystore.password=$KAFKA_PROPERTIES_SSL_KEYSTORE_PASSWORD" >> $CONNECTION_PROPERTIES_PATH - echo "ssl.key.password=$KAFKA_PROPERTIES_SSL_KEY_PASSWORD" >> $CONNECTION_PROPERTIES_PATH - if [[ -n $KAFKA_PROPERTIES_SSL_KEYSTORE_TYPE ]]; then - echo "ssl.keystore.type=$KAFKA_PROPERTIES_SSL_KEYSTORE_TYPE" >> $CONNECTION_PROPERTIES_PATH - fi - fi - if [[ -n $KAFKA_PROPERTIES_SSL_TRUSTSTORE_LOCATION ]]; then - echo "ssl.truststore.location=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_LOCATION" >> $CONNECTION_PROPERTIES_PATH - if [[ $KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE != "PEM" ]]; then - echo "ssl.truststore.password=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_PASSWORD" >> $CONNECTION_PROPERTIES_PATH - fi - if [[ -n $KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE ]]; then - echo "ssl.truststore.type=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE" >> $CONNECTION_PROPERTIES_PATH - fi - fi - echo "ssl.endpoint.identification.algorithm=$KAFKA_PROPERTIES_SSL_ENDPOINT_IDENTIFICATION_ALGORITHM" >> $CONNECTION_PROPERTIES_PATH -fi - -# Add support for SASL_CLIENT_CALLBACK_HANDLER_CLASS -if [[ -n "$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" ]]; then - echo "sasl.client.callback.handler.class=$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" >> $CONNECTION_PROPERTIES_PATH -fi +python env_to_properties.py KAFKA_PROPERTIES_ $CONNECTION_PROPERTIES_PATH # cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 . kafka-ready.sh From 2336207a2a6a22de582b2dc4488407668276da2b Mon Sep 17 00:00:00 2001 From: "Renan F. Lima" <51028757+lima-renan@users.noreply.github.com> Date: Wed, 31 Jul 2024 17:06:19 -0300 Subject: [PATCH 05/16] fix(ingest): fix oracle cronjob ingestion (#11001) Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com> --- docker/datahub-ingestion-base/Dockerfile | 16 ++++++++-------- docker/datahub-ingestion/Dockerfile | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 383478b675640f..8a238c32704bb6 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -85,18 +85,18 @@ RUN apt-get update && apt-get install -y -qq \ RUN if [ $(arch) = "x86_64" ]; then \ mkdir /opt/oracle && \ cd /opt/oracle && \ - wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/216000/instantclient-basic-linux.x64-21.6.0.0.0dbru.zip && \ - unzip instantclient-basic-linux.x64-21.6.0.0.0dbru.zip && \ - rm instantclient-basic-linux.x64-21.6.0.0.0dbru.zip && \ - sh -c "echo /opt/oracle/instantclient_21_6 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \ + wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/2115000/instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \ + unzip instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \ + rm instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \ + sh -c "echo /opt/oracle/instantclient_21_15 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \ ldconfig; \ else \ mkdir /opt/oracle && \ cd /opt/oracle && \ - wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip && \ - unzip instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip && \ - rm instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip && \ - sh -c "echo /opt/oracle/instantclient_19_10 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \ + wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/1923000/instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \ + unzip instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \ + rm instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \ + sh -c "echo /opt/oracle/instantclient_19_23 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \ ldconfig; \ fi; diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 068911695811f5..b8eda548491224 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=full ARG BASE_IMAGE=acryldata/datahub-ingestion-base -ARG DOCKER_VERSION=head +ARG DOCKER_VERSION=head-full ARG DEBIAN_REPO_URL=https://deb.debian.org/debian ARG PIP_MIRROR_URL=https://pypi.python.org/simple From b13d990f07d2a2d3c3042c79047958f8d288017b Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Wed, 31 Jul 2024 15:30:30 -0500 Subject: [PATCH 06/16] chore(ci): revert update deprecated github actions (#10977) (#11062) --- .github/workflows/airflow-plugin.yml | 4 ++-- .github/workflows/build-and-test.yml | 4 ++-- .github/workflows/dagster-plugin.yml | 4 ++-- .github/workflows/docker-unified.yml | 6 +++--- .github/workflows/metadata-ingestion.yml | 4 ++-- .github/workflows/metadata-io.yml | 4 ++-- .github/workflows/spark-smoke-test.yml | 4 ++-- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml index 114256ad825e56..d4f0a1369da253 100644 --- a/.github/workflows/airflow-plugin.yml +++ b/.github/workflows/airflow-plugin.yml @@ -74,7 +74,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && pip freeze - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v3 if: ${{ always() && matrix.python-version == '3.10' && matrix.extra_pip_requirements == 'apache-airflow>=2.7.0' }} with: name: Test Results (Airflow Plugin ${{ matrix.python-version}}) @@ -98,7 +98,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml index d2116fc2fca788..c93267947b65a8 100644 --- a/.github/workflows/build-and-test.yml +++ b/.github/workflows/build-and-test.yml @@ -99,7 +99,7 @@ jobs: if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }} run: | ./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v3 if: always() with: name: Test Results (build) @@ -128,7 +128,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/dagster-plugin.yml b/.github/workflows/dagster-plugin.yml index 381a01aca82c34..48f1b24196c9e0 100644 --- a/.github/workflows/dagster-plugin.yml +++ b/.github/workflows/dagster-plugin.yml @@ -56,7 +56,7 @@ jobs: - name: pip freeze show list installed if: always() run: source metadata-ingestion-modules/dagster-plugin/venv/bin/activate && pip freeze - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v3 if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }} with: name: Test Results (dagster Plugin ${{ matrix.python-version}}) @@ -79,7 +79,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index 216f51e8ce970d..9487e71e8da3d1 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -1024,18 +1024,18 @@ jobs: docker logs datahub-datahub-frontend-react-1 >& frontend-${{ matrix.test_strategy }}.log || true docker logs datahub-upgrade-1 >& upgrade-${{ matrix.test_strategy }}.log || true - name: Upload logs - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 if: failure() with: name: docker logs path: "*.log" - name: Upload screenshots - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 if: failure() with: name: cypress-snapshots-${{ matrix.test_strategy }} path: smoke-test/tests/cypress/cypress/screenshots/ - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v3 if: always() with: name: Test Results (smoke tests) ${{ matrix.test_strategy }} diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index ef84afd9c37793..51b97552eb150a 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -83,7 +83,7 @@ jobs: df -hl docker image ls docker system df - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v3 with: name: Test Results (metadata ingestion ${{ matrix.python-version }}) path: | @@ -106,7 +106,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml index 4b1e878ea25261..6797c7ad67c0b6 100644 --- a/.github/workflows/metadata-io.yml +++ b/.github/workflows/metadata-io.yml @@ -62,7 +62,7 @@ jobs: - name: Gradle build (and test) run: | ./gradlew :metadata-io:test - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v3 if: always() with: name: Test Results (metadata-io) @@ -78,7 +78,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Upload - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 with: name: Event File path: ${{ github.event_path }} diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml index 46f6e95454477f..8ffc8420ba9413 100644 --- a/.github/workflows/spark-smoke-test.yml +++ b/.github/workflows/spark-smoke-test.yml @@ -69,14 +69,14 @@ jobs: docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true - name: Upload logs - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v3 if: failure() with: name: docker logs path: | "**/build/container-logs/*.log" "*.log" - - uses: actions/upload-artifact@v4 + - uses: actions/upload-artifact@v3 if: always() with: name: Test Results (smoke tests) From 89933fee1e141068fac60c3639eb8b2fa5b43871 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Wed, 31 Jul 2024 14:16:18 -0700 Subject: [PATCH 07/16] feat(ingest/dbt-cloud): update metadata_endpoint inference (#11041) --- metadata-ingestion/src/datahub/cli/get_cli.py | 1 + .../datahub/ingestion/source/dbt/dbt_cloud.py | 37 ++++++++++++++++--- .../tests/unit/test_dbt_source.py | 25 +++++++++++-- 3 files changed, 53 insertions(+), 10 deletions(-) diff --git a/metadata-ingestion/src/datahub/cli/get_cli.py b/metadata-ingestion/src/datahub/cli/get_cli.py index b6ff5f39a2c14b..27fa987ac79779 100644 --- a/metadata-ingestion/src/datahub/cli/get_cli.py +++ b/metadata-ingestion/src/datahub/cli/get_cli.py @@ -56,6 +56,7 @@ def urn(ctx: Any, urn: Optional[str], aspect: List[str], details: bool) -> None: entity_urn=urn, aspects=aspect, typed=False, + details=details, ), sort_keys=True, indent=2, diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index 8a99f096b51676..0672b9ce6f781c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py @@ -40,8 +40,7 @@ class DBTCloudConfig(DBTCommonConfig): metadata_endpoint: str = Field( default="https://metadata.cloud.getdbt.com/graphql", - description="The dbt Cloud metadata API endpoint. This is deprecated, and will be removed in a future release. Please use access_url instead.", - deprecated=True, + description="The dbt Cloud metadata API endpoint. If not provided, we will try to infer it from the access_url.", ) token: str = Field( @@ -66,13 +65,39 @@ class DBTCloudConfig(DBTCommonConfig): @root_validator(pre=True) def set_metadata_endpoint(cls, values: dict) -> dict: if values.get("access_url") and not values.get("metadata_endpoint"): - parsed_uri = urlparse(values["access_url"]) - values[ - "metadata_endpoint" - ] = f"{parsed_uri.scheme}://metadata.{parsed_uri.netloc}/graphql" + metadata_endpoint = infer_metadata_endpoint(values["access_url"]) + if metadata_endpoint is None: + raise ValueError( + "Unable to infer the metadata endpoint from the access URL. Please provide a metadata endpoint." + ) + values["metadata_endpoint"] = metadata_endpoint return values +def infer_metadata_endpoint(access_url: str) -> Optional[str]: + # See https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses#api-access-urls + # and https://docs.getdbt.com/docs/dbt-cloud-apis/discovery-querying#discovery-api-endpoints + + try: + parsed_uri = urlparse(access_url) + assert parsed_uri.scheme is not None + assert parsed_uri.hostname is not None + except Exception as e: + logger.debug(f"Unable to parse access URL {access_url}: {e}", exc_info=e) + return None + + if parsed_uri.hostname.endswith(".dbt.com"): + # For cell-based deployments. + # prefix.region.dbt.com -> prefix.metadata.region.dbt.com + hostname_parts = parsed_uri.hostname.split(".", maxsplit=1) + return f"{parsed_uri.scheme}://{hostname_parts[0]}.metadata.{hostname_parts[1]}/graphql" + elif parsed_uri.hostname.endswith(".getdbt.com"): + return f"{parsed_uri.scheme}://metadata.{parsed_uri.netloc}/graphql" + else: + # The self-hosted variants also have the metadata. prefix. + return f"{parsed_uri.scheme}://metadata.{parsed_uri.netloc}/graphql" + + _DBT_GRAPHQL_COMMON_FIELDS = """ runId accountId diff --git a/metadata-ingestion/tests/unit/test_dbt_source.py b/metadata-ingestion/tests/unit/test_dbt_source.py index 48a6fd0f650685..01d7a4809b01b8 100644 --- a/metadata-ingestion/tests/unit/test_dbt_source.py +++ b/metadata-ingestion/tests/unit/test_dbt_source.py @@ -7,7 +7,10 @@ from datahub.emitter import mce_builder from datahub.ingestion.api.common import PipelineContext -from datahub.ingestion.source.dbt.dbt_cloud import DBTCloudConfig +from datahub.ingestion.source.dbt.dbt_cloud import ( + DBTCloudConfig, + infer_metadata_endpoint, +) from datahub.ingestion.source.dbt.dbt_core import ( DBTCoreConfig, DBTCoreSource, @@ -366,7 +369,7 @@ def test_dbt_entity_emission_configuration_helpers(): def test_dbt_cloud_config_access_url(): config_dict = { - "access_url": "https://my-dbt-cloud.dbt.com", + "access_url": "https://emea.getdbt.com", "token": "dummy_token", "account_id": "123456", "project_id": "1234567", @@ -375,8 +378,8 @@ def test_dbt_cloud_config_access_url(): "target_platform": "dummy_platform", } config = DBTCloudConfig.parse_obj(config_dict) - assert config.access_url == "https://my-dbt-cloud.dbt.com" - assert config.metadata_endpoint == "https://metadata.my-dbt-cloud.dbt.com/graphql" + assert config.access_url == "https://emea.getdbt.com" + assert config.metadata_endpoint == "https://metadata.emea.getdbt.com/graphql" def test_dbt_cloud_config_with_defined_metadata_endpoint(): @@ -398,6 +401,20 @@ def test_dbt_cloud_config_with_defined_metadata_endpoint(): ) +def test_infer_metadata_endpoint() -> None: + assert ( + infer_metadata_endpoint("https://cloud.getdbt.com") + == "https://metadata.cloud.getdbt.com/graphql" + ) + assert ( + infer_metadata_endpoint("https://prefix.us1.dbt.com") + == "https://prefix.metadata.us1.dbt.com/graphql" + ) + assert ( + infer_metadata_endpoint("http://dbt.corp.internal") + ) == "http://metadata.dbt.corp.internal/graphql" + + def test_dbt_time_parsing() -> None: time_formats = [ "2024-03-28T05:56:15.236210Z", From 2ab43f393e8f9835b01845930055738e27d61614 Mon Sep 17 00:00:00 2001 From: Matt Exchange Date: Thu, 1 Aug 2024 03:19:33 +0100 Subject: [PATCH 08/16] build: Reduce size of datahub-frontend-react image by 50-ish% (#10878) Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com> --- docker/datahub-frontend/Dockerfile | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/docker/datahub-frontend/Dockerfile b/docker/datahub-frontend/Dockerfile index a828f1d8c27ad5..2a9354cbf6a04f 100644 --- a/docker/datahub-frontend/Dockerfile +++ b/docker/datahub-frontend/Dockerfile @@ -25,24 +25,25 @@ RUN apk --no-cache --update-cache --available upgrade \ ENV LD_LIBRARY_PATH="/lib:/lib64" -FROM base as prod-install +FROM base as unpack COPY ./datahub-frontend.zip / -RUN unzip datahub-frontend.zip -d /datahub-frontend \ - && mv /datahub-frontend/main/* /datahub-frontend \ - && rmdir /datahub-frontend/main \ - && rm datahub-frontend.zip +RUN unzip datahub-frontend.zip -d /tmp/out \ + && mv /tmp/out/main /datahub-frontend COPY ./docker/monitoring/client-prometheus-config.yaml /datahub-frontend/ RUN chown -R datahub:datahub /datahub-frontend && chmod 755 /datahub-frontend +FROM base as prod-install + +COPY --from=unpack /datahub-frontend/ /datahub-frontend/ + FROM base as dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. # See this excellent thread https://github.com/docker/cli/issues/1134 VOLUME [ "/datahub-frontend" ] FROM ${APP_ENV}-install as final -COPY ./docker/datahub-frontend/start.sh / -RUN chown datahub:datahub /start.sh && chmod 755 /start.sh +COPY --chown=datahub:datahub --chmod=755 ./docker/datahub-frontend/start.sh / USER datahub ARG SERVER_PORT=9002 From 1e6065ea842a763aa3d911ca6f32ff05290ed058 Mon Sep 17 00:00:00 2001 From: Pedro Silva Date: Thu, 1 Aug 2024 09:11:01 +0100 Subject: [PATCH 09/16] fix(ci): Fix lint issue in datahub_ingestion_run_summary_provider.py (#11063) --- .../reporting/datahub_ingestion_run_summary_provider.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py index a175870cd9fbea..33bfb63feb3fd7 100644 --- a/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py +++ b/metadata-ingestion/src/datahub/ingestion/reporting/datahub_ingestion_run_summary_provider.py @@ -31,7 +31,6 @@ from datahub.utilities.logging_manager import get_log_buffer from datahub.utilities.urns.urn import Urn - logger = logging.getLogger(__name__) @@ -213,7 +212,7 @@ def on_completion( durationMs=self.get_cur_time_in_ms() - self.start_time_ms, # Truncate summary such that the generated MCP will not exceed GMS's payload limit. # Hardcoding the overall size of dataHubExecutionRequestResult to >1MB by trimming summary to 800,000 chars - report=summary[-self._MAX_SUMMARY_SIZE:], + report=summary[-self._MAX_SUMMARY_SIZE :], structuredReport=structured_report, ) From e9a0e2715cbf0b1306488071b898124f43f07f84 Mon Sep 17 00:00:00 2001 From: Jonny Dixon <45681293+acrylJonny@users.noreply.github.com> Date: Thu, 1 Aug 2024 09:18:17 +0100 Subject: [PATCH 10/16] docs(ingest): update developing-a-transformer.md (#11019) --- docs/actions/guides/developing-a-transformer.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/actions/guides/developing-a-transformer.md b/docs/actions/guides/developing-a-transformer.md index a843dbc846cd51..6406cdfae6104b 100644 --- a/docs/actions/guides/developing-a-transformer.md +++ b/docs/actions/guides/developing-a-transformer.md @@ -23,7 +23,7 @@ print the configuration that is provided when it is created, and print any Event ```python # custom_transformer.py from datahub_actions.transform.transformer import Transformer -from datahub_actions.event.event import EventEnvelope +from datahub_actions.event.event_envelope import EventEnvelope from datahub_actions.pipeline.pipeline_context import PipelineContext from typing import Optional @@ -75,7 +75,7 @@ Next, install the package pip install -e . ``` -inside the module. (alt.`python setup.py`). +inside the module. (alt.`python setup.py`). Once we have done this, our class will be referencable via `custom_transformer_example.custom_transformer:CustomTransformer`. @@ -96,7 +96,7 @@ source: connection: bootstrap: ${KAFKA_BOOTSTRAP_SERVER:-localhost:9092} schema_registry_url: ${SCHEMA_REGISTRY_URL:-http://localhost:8081} -transform: +transform: - type: "custom_transformer_example.custom_transformer:CustomTransformer" config: # Some sample configuration which should be printed on create. @@ -130,4 +130,4 @@ it without defining the full module path. Prerequisites to consideration for inclusion in the core Transformer library include - **Testing** Define unit tests for your Transformer -- **Deduplication** Confirm that no existing Transformer serves the same purpose, or can be easily extended to serve the same purpose \ No newline at end of file +- **Deduplication** Confirm that no existing Transformer serves the same purpose, or can be easily extended to serve the same purpose From c83907fdbc4c7444795b8e0b6262d02210e6e451 Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Thu, 1 Aug 2024 03:18:30 -0500 Subject: [PATCH 11/16] feat(search-test): update search tests from #10408 (#11056) --- .../metadata/graph/GraphServiceTestBase.java | 81 ++++++++++--------- .../graph/neo4j/Neo4jGraphServiceTest.java | 14 ---- .../search/SearchGraphServiceTestBase.java | 50 ++++-------- .../search/LineageServiceTestBase.java | 1 + .../search/SearchServiceTestBase.java | 1 + .../metadata/search/TestEntityTestBase.java | 2 + .../indexbuilder/IndexBuilderTestBase.java | 8 +- .../SystemMetadataServiceTestBase.java | 2 + .../TimeseriesAspectServiceTestBase.java | 2 +- .../SearchLineageFixtureConfiguration.java | 7 +- .../search/BulkProcessorProxyListener.java | 44 ++++++++++ .../test/search/BulkProcessorTestUtils.java | 67 +++++++++++++++ .../test/search/SearchTestUtils.java | 20 ++++- .../SearchTestContainerConfiguration.java | 29 ++++--- .../search/GraphQueryConfiguration.java | 11 --- 15 files changed, 218 insertions(+), 121 deletions(-) create mode 100644 metadata-io/src/test/java/io/datahubproject/test/search/BulkProcessorProxyListener.java create mode 100644 metadata-io/src/test/java/io/datahubproject/test/search/BulkProcessorTestUtils.java diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java index 1aebc48153bbe5..b430313f5904b3 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/GraphServiceTestBase.java @@ -3,6 +3,7 @@ import static com.linkedin.metadata.search.utils.QueryUtils.EMPTY_FILTER; import static com.linkedin.metadata.search.utils.QueryUtils.newFilter; import static com.linkedin.metadata.search.utils.QueryUtils.newRelationshipFilter; +import static io.datahubproject.test.search.SearchTestUtils.getGraphQueryConfiguration; import static org.testng.Assert.*; import com.google.common.collect.ImmutableList; @@ -272,6 +273,8 @@ public int compare(RelatedEntity left, RelatedEntity right) { /** Any source and destination type value. */ protected static @Nullable List anyType = null; + protected final GraphQueryConfiguration _graphQueryConfiguration = getGraphQueryConfiguration(); + /** Timeout used to test concurrent ops in doTestConcurrentOp. */ protected Duration getTestConcurrentOpTimeout() { return Duration.ofMinutes(1); @@ -378,8 +381,7 @@ protected GraphService getPopulatedGraphService() throws Exception { } protected GraphService getLineagePopulatedGraphService() throws Exception { - return getLineagePopulatedGraphService( - GraphQueryConfiguration.testDefaults.isEnableMultiPathSearch()); + return getLineagePopulatedGraphService(_graphQueryConfiguration.isEnableMultiPathSearch()); } protected GraphService getLineagePopulatedGraphService(boolean multiPathSearch) throws Exception { @@ -1896,15 +1898,24 @@ public void testConcurrentAddEdge() throws Exception { allRelationships, outgoingRelationships, 0, - nodes * relationshipTypes * 2); + edges.size()); - Set expectedRelatedEntities = - edges.stream() - .map( - edge -> - new RelatedEntity(edge.getRelationshipType(), edge.getDestination().toString())) - .collect(Collectors.toSet()); - assertEquals(new HashSet<>(relatedEntities.entities), expectedRelatedEntities); + Set expectedRelatedEntities = convertEdgesToRelatedEntities(edges); + assertEquals( + deduplicateRelatedEntitiesByRelationshipTypeAndDestination(relatedEntities), + expectedRelatedEntities); + } + + protected Set convertEdgesToRelatedEntities(List edges) { + return edges.stream() + .map( + edge -> new RelatedEntity(edge.getRelationshipType(), edge.getDestination().toString())) + .collect(Collectors.toSet()); + } + + protected Set deduplicateRelatedEntitiesByRelationshipTypeAndDestination( + RelatedEntitiesResult relatedEntitiesResult) { + return Set.copyOf(relatedEntitiesResult.getEntities()); } @Test @@ -1933,8 +1944,10 @@ public void testConcurrentRemoveEdgesFromNode() throws Exception { allRelationships, outgoingRelationships, 0, - nodes * relationshipTypes * 2); - assertEquals(relatedEntities.entities.size(), nodes * relationshipTypes); + edges.size()); + assertEquals( + deduplicateRelatedEntitiesByRelationshipTypeAndDestination(relatedEntities).size(), + nodes * relationshipTypes); // delete all edges concurrently Stream operations = @@ -1992,8 +2005,10 @@ public void testConcurrentRemoveNodes() throws Exception { allRelationships, outgoingRelationships, 0, - nodes * relationshipTypes * 2); - assertEquals(relatedEntities.entities.size(), nodes * relationshipTypes); + edges.size()); + assertEquals( + deduplicateRelatedEntitiesByRelationshipTypeAndDestination(relatedEntities).size(), + nodes * relationshipTypes); // remove all nodes concurrently // nodes will be removed multiple times @@ -2138,30 +2153,20 @@ public void testHighlyConnectedGraphWalk() throws Exception { doTestConcurrentOp(operations); syncAfterWrite(); - Set expectedRelatedEntities = - edges.stream() - .map( - edge -> - new RelatedEntity(edge.getRelationshipType(), edge.getDestination().toString())) - .collect(Collectors.toSet()); - RelatedEntitiesResult relatedEntities = null; - for (int i = 0; i < 3; i++) { - relatedEntities = - service.findRelatedEntities( - null, - EMPTY_FILTER, - null, - EMPTY_FILTER, - allRelationships, - outgoingRelationships, - 0, - 400); - if (!new HashSet<>(relatedEntities.getEntities()).equals(expectedRelatedEntities)) { - // Sleep up to 6 seconds in case Elastic needs to catch up - Thread.sleep(2000); - } - } - assertEquals(new HashSet<>(relatedEntities.getEntities()), expectedRelatedEntities); + Set expectedRelatedEntities = convertEdgesToRelatedEntities(edges); + RelatedEntitiesResult relatedEntities = + service.findRelatedEntities( + null, + EMPTY_FILTER, + null, + EMPTY_FILTER, + allRelationships, + outgoingRelationships, + 0, + edges.size()); + assertEquals( + deduplicateRelatedEntitiesByRelationshipTypeAndDestination(relatedEntities), + expectedRelatedEntities); Urn root = dataset1Urn; EntityLineageResult lineageResult = diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java index 08c19bf8f52887..7513feb30d496b 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/neo4j/Neo4jGraphServiceTest.java @@ -169,20 +169,6 @@ public void testConcurrentAddEdge() { "Neo4jGraphService does not manage to add all edges added concurrently"); } - @Test - @Override - public void testConcurrentRemoveEdgesFromNode() { - // https://github.com/datahub-project/datahub/issues/3118 - throw new SkipException("Neo4jGraphService produces duplicates"); - } - - @Test - @Override - public void testConcurrentRemoveNodes() { - // https://github.com/datahub-project/datahub/issues/3118 - throw new SkipException("Neo4jGraphService produces duplicates"); - } - @Test public void testRemoveEdge() throws Exception { DatasetUrn datasetUrn = diff --git a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java index b4ad5ce61d8f4e..06f1369ff0670c 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/graph/search/SearchGraphServiceTestBase.java @@ -12,7 +12,6 @@ import com.linkedin.data.template.SetMode; import com.linkedin.metadata.aspect.models.graph.Edge; import com.linkedin.metadata.aspect.models.graph.RelatedEntity; -import com.linkedin.metadata.config.search.GraphQueryConfiguration; import com.linkedin.metadata.graph.EntityLineageResult; import com.linkedin.metadata.graph.GraphService; import com.linkedin.metadata.graph.GraphServiceTestBase; @@ -41,6 +40,8 @@ import java.util.Comparator; import java.util.HashSet; import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; import javax.annotation.Nonnull; import javax.annotation.Nullable; import org.junit.Assert; @@ -64,19 +65,18 @@ public abstract class SearchGraphServiceTestBase extends GraphServiceTestBase { private final IndexConvention _indexConvention = IndexConventionImpl.NO_PREFIX; private final String _indexName = _indexConvention.getIndexName(INDEX_NAME); private ElasticSearchGraphService _client; - private boolean _enableMultiPathSearch = - GraphQueryConfiguration.testDefaults.isEnableMultiPathSearch(); private static final String TAG_RELATIONSHIP = "SchemaFieldTaggedWith"; @BeforeClass public void setup() { - _client = buildService(_enableMultiPathSearch); + _client = buildService(_graphQueryConfiguration.isEnableMultiPathSearch()); _client.reindexAll(Collections.emptySet()); } @BeforeMethod public void wipe() throws Exception { + syncAfterWrite(); _client.clear(); syncAfterWrite(); } @@ -97,14 +97,10 @@ private ElasticSearchGraphService buildService(boolean enableMultiPathSearch) { } catch (EntityRegistryException e) { throw new RuntimeException(e); } - GraphQueryConfiguration configuration = GraphQueryConfiguration.testDefaults; - configuration.setEnableMultiPathSearch(enableMultiPathSearch); + _graphQueryConfiguration.setEnableMultiPathSearch(enableMultiPathSearch); ESGraphQueryDAO readDAO = new ESGraphQueryDAO( - getSearchClient(), - lineageRegistry, - _indexConvention, - GraphQueryConfiguration.testDefaults); + getSearchClient(), lineageRegistry, _indexConvention, _graphQueryConfiguration); ESGraphWriteDAO writeDAO = new ESGraphWriteDAO(_indexConvention, getBulkProcessor(), 1); return new ElasticSearchGraphService( lineageRegistry, @@ -118,8 +114,7 @@ private ElasticSearchGraphService buildService(boolean enableMultiPathSearch) { @Override @Nonnull protected GraphService getGraphService(boolean enableMultiPathSearch) { - if (enableMultiPathSearch != _enableMultiPathSearch) { - _enableMultiPathSearch = enableMultiPathSearch; + if (enableMultiPathSearch != _graphQueryConfiguration.isEnableMultiPathSearch()) { _client = buildService(enableMultiPathSearch); _client.reindexAll(Collections.emptySet()); } @@ -129,7 +124,7 @@ protected GraphService getGraphService(boolean enableMultiPathSearch) { @Override @Nonnull protected GraphService getGraphService() { - return getGraphService(GraphQueryConfiguration.testDefaults.isEnableMultiPathSearch()); + return getGraphService(_graphQueryConfiguration.isEnableMultiPathSearch()); } @Override @@ -305,26 +300,15 @@ public void testRemoveEdge() throws Exception { assertEquals(result.getTotal(), 0); } - @Test - @Override - public void testConcurrentAddEdge() { - // https://github.com/datahub-project/datahub/issues/3124 - throw new SkipException( - "This test is flaky for ElasticSearchGraphService, ~5% of the runs fail on a race condition"); - } - - @Test - @Override - public void testConcurrentRemoveEdgesFromNode() { - // https://github.com/datahub-project/datahub/issues/3118 - throw new SkipException("ElasticSearchGraphService produces duplicates"); - } - - @Test - @Override - public void testConcurrentRemoveNodes() { - // https://github.com/datahub-project/datahub/issues/3118 - throw new SkipException("ElasticSearchGraphService produces duplicates"); + // ElasticSearchGraphService produces duplicates + // https://github.com/datahub-project/datahub/issues/3118 + protected Set deduplicateRelatedEntitiesByRelationshipTypeAndDestination( + RelatedEntitiesResult relatedEntitiesResult) { + return relatedEntitiesResult.getEntities().stream() + .map( + relatedEntity -> + new RelatedEntity(relatedEntity.getRelationshipType(), relatedEntity.getUrn())) + .collect(Collectors.toSet()); } @Test diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java index 3dbbfb2cebc3f3..a9d84ae1f3aea1 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java @@ -162,6 +162,7 @@ private void resetService(boolean withCache, boolean withLightingCache) { @BeforeMethod public void wipe() throws Exception { + syncAfterWrite(getBulkProcessor()); elasticSearchService.clear(operationContext); clearCache(false); syncAfterWrite(getBulkProcessor()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java index a610cf95f827ae..445b71b2eaff62 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/SearchServiceTestBase.java @@ -108,6 +108,7 @@ private void resetSearchService() { @BeforeMethod public void wipe() throws Exception { + syncAfterWrite(getBulkProcessor()); elasticSearchService.clear(operationContext); syncAfterWrite(getBulkProcessor()); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java index 58574025aeeac3..ab5e90f77c21aa 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/TestEntityTestBase.java @@ -70,7 +70,9 @@ public void setup() { @BeforeMethod public void wipe() throws Exception { + syncAfterWrite(getBulkProcessor()); elasticSearchService.clear(opContext); + syncAfterWrite(getBulkProcessor()); } @Nonnull diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/IndexBuilderTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/IndexBuilderTestBase.java index 92ca4c5ed8a05e..f639e5c5fd3937 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/IndexBuilderTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/indexbuilder/IndexBuilderTestBase.java @@ -39,9 +39,9 @@ public abstract class IndexBuilderTestBase extends AbstractTestNGSpringContextTe @Nonnull protected abstract RestHighLevelClient getSearchClient(); - private static IndicesClient _indexClient; + private IndicesClient _indexClient; private static final String TEST_INDEX_NAME = "esindex_builder_test"; - private static ESIndexBuilder testDefaultBuilder; + private ESIndexBuilder testDefaultBuilder; @BeforeClass public void setup() { @@ -63,7 +63,7 @@ public void setup() { } @BeforeMethod - public static void wipe() throws Exception { + public void wipe() throws Exception { try { _indexClient .getAlias(new GetAliasesRequest(TEST_INDEX_NAME), RequestOptions.DEFAULT) @@ -86,7 +86,7 @@ public static void wipe() throws Exception { } } - public static GetIndexResponse getTestIndex() throws IOException { + public GetIndexResponse getTestIndex() throws IOException { return _indexClient.get( new GetIndexRequest(TEST_INDEX_NAME).includeDefaults(true), RequestOptions.DEFAULT); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java index 7067dd3a6763e7..d843191bed7413 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/systemmetadata/SystemMetadataServiceTestBase.java @@ -44,7 +44,9 @@ public void setup() { @BeforeMethod public void wipe() throws Exception { + syncAfterWrite(getBulkProcessor()); _client.clear(); + syncAfterWrite(getBulkProcessor()); } @Nonnull diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java index b44f01d90dae40..10c6f09cb8f8d6 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java @@ -1291,7 +1291,7 @@ public void testCountByFilter() { @Test( groups = {"testCountAfterDelete"}, dependsOnGroups = {"deleteAspectValues1"}) - public void testCountByFilterAfterDelete() throws InterruptedException { + public void testCountByFilterAfterDelete() throws Exception { syncAfterWrite(getBulkProcessor()); // Test with filter Criterion hasUrnCriterion = diff --git a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java index 4cd818db34bf4b..e783c011de6d0e 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java +++ b/metadata-io/src/test/java/io/datahubproject/test/fixtures/search/SearchLineageFixtureConfiguration.java @@ -1,6 +1,7 @@ package io.datahubproject.test.fixtures.search; import static com.linkedin.metadata.Constants.*; +import static io.datahubproject.test.search.SearchTestUtils.getGraphQueryConfiguration; import com.linkedin.entity.client.EntityClient; import com.linkedin.metadata.client.JavaEntityClient; @@ -8,7 +9,6 @@ import com.linkedin.metadata.config.cache.EntityDocCountCacheConfiguration; import com.linkedin.metadata.config.cache.SearchLineageCacheConfiguration; import com.linkedin.metadata.config.search.ElasticSearchConfiguration; -import com.linkedin.metadata.config.search.GraphQueryConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.entity.EntityServiceImpl; @@ -172,10 +172,7 @@ protected ElasticSearchGraphService graphService( indexConvention, new ESGraphWriteDAO(indexConvention, bulkProcessor, 1), new ESGraphQueryDAO( - searchClient, - lineageRegistry, - indexConvention, - GraphQueryConfiguration.testDefaults), + searchClient, lineageRegistry, indexConvention, getGraphQueryConfiguration()), indexBuilder); graphService.reindexAll(Collections.emptySet()); return graphService; diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/BulkProcessorProxyListener.java b/metadata-io/src/test/java/io/datahubproject/test/search/BulkProcessorProxyListener.java new file mode 100644 index 00000000000000..a409a1e97ad909 --- /dev/null +++ b/metadata-io/src/test/java/io/datahubproject/test/search/BulkProcessorProxyListener.java @@ -0,0 +1,44 @@ +package io.datahubproject.test.search; + +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import org.opensearch.action.bulk.BulkProcessor; +import org.opensearch.action.bulk.BulkRequest; +import org.opensearch.action.bulk.BulkResponse; + +public class BulkProcessorProxyListener implements BulkProcessor.Listener { + private final BulkProcessor.Listener listener; + private final AtomicInteger unsentItemsCounter = new AtomicInteger(); + + public BulkProcessorProxyListener(BulkProcessor.Listener listener) { + this.listener = listener; + } + + @Override + public void beforeBulk(long l, BulkRequest bulkRequest) { + unsentItemsCounter.addAndGet(bulkRequest.numberOfActions()); + listener.beforeBulk(l, bulkRequest); + } + + @Override + public void afterBulk(long l, BulkRequest bulkRequest, BulkResponse bulkResponse) { + unsentItemsCounter.addAndGet(-bulkResponse.getItems().length); + listener.afterBulk(l, bulkRequest, bulkResponse); + } + + @Override + public void afterBulk(long l, BulkRequest bulkRequest, Throwable throwable) { + listener.afterBulk(l, bulkRequest, throwable); + } + + void waitForBulkProcessed() throws InterruptedException { + for (int i = 0; i < 6000; i++) { + if (unsentItemsCounter.get() == 0) { + break; + } + TimeUnit.MILLISECONDS.sleep(5); + } + // reset the counter just in case + unsentItemsCounter.set(0); + } +} diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/BulkProcessorTestUtils.java b/metadata-io/src/test/java/io/datahubproject/test/search/BulkProcessorTestUtils.java new file mode 100644 index 00000000000000..416a5d40bb0e3f --- /dev/null +++ b/metadata-io/src/test/java/io/datahubproject/test/search/BulkProcessorTestUtils.java @@ -0,0 +1,67 @@ +package io.datahubproject.test.search; + +import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; +import java.io.IOException; +import java.util.concurrent.TimeUnit; +import org.opensearch.action.admin.cluster.node.tasks.list.ListTasksRequest; +import org.opensearch.action.admin.indices.refresh.RefreshRequest; +import org.opensearch.action.bulk.BulkProcessor; +import org.opensearch.client.RequestOptions; +import org.opensearch.client.RestHighLevelClient; +import org.opensearch.common.unit.TimeValue; +import org.springframework.test.util.ReflectionTestUtils; + +public class BulkProcessorTestUtils { + private BulkProcessorTestUtils() {} + + public static void syncAfterWrite(ESBulkProcessor bulkProcessor) + throws InterruptedException, IOException { + bulkProcessor.flush(); + final RestHighLevelClient searchClient = getRestHighLevelClient(bulkProcessor); + // if the bulks are big it takes time for Elastic/OpenSearch to process these bulk requests + getBulkProcessorListener(bulkProcessor).waitForBulkProcessed(); + waitForCompletion(searchClient); + // some tasks might have refresh = false, so we need to refresh manually + searchClient.indices().refresh(new RefreshRequest(), RequestOptions.DEFAULT); + waitForCompletion(searchClient); + } + + private static void waitForCompletion(RestHighLevelClient searchClient) + throws IOException, InterruptedException { + while (!searchClient + .tasks() + .list( + new ListTasksRequest() + .setActions("indices:*,*/put,*/update") + .setWaitForCompletion(true) + .setTimeout(TimeValue.timeValueSeconds(30)), + RequestOptions.DEFAULT) + .getTasks() + .isEmpty()) { + // Mostly this is not reached, but in some rare cases it might + TimeUnit.MILLISECONDS.sleep(5); + } + } + + private static RestHighLevelClient getRestHighLevelClient(ESBulkProcessor esBulkProcessor) { + return (RestHighLevelClient) ReflectionTestUtils.getField(esBulkProcessor, "searchClient"); + } + + private static BulkProcessorProxyListener getBulkProcessorListener( + ESBulkProcessor esBulkProcessor) { + var bulkProcessor = ReflectionTestUtils.getField(esBulkProcessor, "bulkProcessor"); + var bulkRequestHandler = ReflectionTestUtils.getField(bulkProcessor, "bulkRequestHandler"); + return (BulkProcessorProxyListener) + ReflectionTestUtils.getField(bulkRequestHandler, "listener"); + } + + public static void replaceBulkProcessorListener(ESBulkProcessor esBulkProcessor) { + var bulkProcessor = + (BulkProcessor) ReflectionTestUtils.getField(esBulkProcessor, "bulkProcessor"); + var bulkRequestHandler = ReflectionTestUtils.getField(bulkProcessor, "bulkRequestHandler"); + var bulkProcessorListener = + (BulkProcessor.Listener) ReflectionTestUtils.getField(bulkRequestHandler, "listener"); + ReflectionTestUtils.setField( + bulkRequestHandler, "listener", new BulkProcessorProxyListener(bulkProcessorListener)); + } +} diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestUtils.java b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestUtils.java index a71c40b70f2b41..24df2afb3b7819 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestUtils.java +++ b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestUtils.java @@ -15,6 +15,7 @@ import com.linkedin.datahub.graphql.types.SearchableEntityType; import com.linkedin.datahub.graphql.types.entitytype.EntityTypeMapper; import com.linkedin.metadata.aspect.AspectRetriever; +import com.linkedin.metadata.config.search.GraphQueryConfiguration; import com.linkedin.metadata.graph.LineageDirection; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.search.LineageSearchResult; @@ -24,6 +25,7 @@ import com.linkedin.metadata.search.SearchService; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; import io.datahubproject.metadata.context.OperationContext; +import java.io.IOException; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -41,9 +43,9 @@ public class SearchTestUtils { private SearchTestUtils() {} - public static void syncAfterWrite(ESBulkProcessor bulkProcessor) throws InterruptedException { - bulkProcessor.flush(); - Thread.sleep(1000); + public static void syncAfterWrite(ESBulkProcessor bulkProcessor) + throws InterruptedException, IOException { + BulkProcessorTestUtils.syncAfterWrite(bulkProcessor); } public static final List SEARCHABLE_ENTITIES; @@ -253,4 +255,16 @@ public HttpAsyncClientBuilder customizeHttpClient( } }); } + + public static GraphQueryConfiguration getGraphQueryConfiguration() { + return new GraphQueryConfiguration() { + { + setBatchSize(1000); + setTimeoutSeconds(10); + setMaxResult(10000); + setEnableMultiPathSearch(true); + setBoostViaNodes(true); + } + }; + } } diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchTestContainerConfiguration.java b/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchTestContainerConfiguration.java index ab6644ce6ff6de..66394def5f99b5 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchTestContainerConfiguration.java +++ b/metadata-io/src/test/java/io/datahubproject/test/search/config/SearchTestContainerConfiguration.java @@ -1,5 +1,7 @@ package io.datahubproject.test.search.config; +import static io.datahubproject.test.search.BulkProcessorTestUtils.replaceBulkProcessorListener; + import com.linkedin.metadata.config.search.ElasticSearchConfiguration; import com.linkedin.metadata.search.elasticsearch.indexbuilder.ESIndexBuilder; import com.linkedin.metadata.search.elasticsearch.update.ESBulkProcessor; @@ -64,18 +66,21 @@ public RestHighLevelClient getElasticsearchClient( @Nonnull public ESBulkProcessor getBulkProcessor( @Qualifier("searchRestHighLevelClient") RestHighLevelClient searchClient) { - return ESBulkProcessor.builder(searchClient) - .async(true) - /* - * Force a refresh as part of this request. This refresh policy does not scale for high indexing or search throughput but is useful - * to present a consistent view to for indices with very low traffic. And it is wonderful for tests! - */ - .writeRequestRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) - .bulkRequestsLimit(10000) - .bulkFlushPeriod(REFRESH_INTERVAL_SECONDS - 1) - .retryInterval(1L) - .numRetries(1) - .build(); + ESBulkProcessor esBulkProcessor = + ESBulkProcessor.builder(searchClient) + .async(true) + /* + * Force a refresh as part of this request. This refresh policy does not scale for high indexing or search throughput but is useful + * to present a consistent view to for indices with very low traffic. And it is wonderful for tests! + */ + .writeRequestRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE) + .bulkRequestsLimit(10000) + .bulkFlushPeriod(REFRESH_INTERVAL_SECONDS - 1) + .retryInterval(1L) + .numRetries(1) + .build(); + replaceBulkProcessorListener(esBulkProcessor); + return esBulkProcessor; } @Primary diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java index cd869a61bf3abb..7a4af8c24262ed 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/GraphQueryConfiguration.java @@ -17,15 +17,4 @@ public class GraphQueryConfiguration { * to be prioritized in the case of a multiple path situation with multi-path search disabled */ private boolean boostViaNodes; - - public static GraphQueryConfiguration testDefaults; - - static { - testDefaults = new GraphQueryConfiguration(); - testDefaults.setBatchSize(1000); - testDefaults.setTimeoutSeconds(10); - testDefaults.setMaxResult(10000); - testDefaults.setEnableMultiPathSearch(true); - testDefaults.setBoostViaNodes(true); - } } From 66ecfae5e48b27c8477ccec94b82db8a51322811 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20L=C3=BCdin?= <13187726+Masterchen09@users.noreply.github.com> Date: Thu, 1 Aug 2024 10:19:32 +0200 Subject: [PATCH 12/16] feat(cli): add aspects parameter to DataHubGraph.get_entity_semityped (#11009) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/graph/client.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 4172c89fbfacff..55bd0b3cf0afc1 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -501,8 +501,8 @@ def get_aspects_for_entity( responds to these calls, and will be fixed automatically when the server-side issue is fixed. :param str entity_urn: The urn of the entity - :param List[Type[Aspect]] aspect_type_list: List of aspect type classes being requested (e.g. [datahub.metadata.schema_classes.DatasetProperties]) - :param List[str] aspects_list: List of aspect names being requested (e.g. [schemaMetadata, datasetProperties]) + :param aspects: List of aspect names being requested (e.g. [schemaMetadata, datasetProperties]) + :param aspect_types: List of aspect type classes being requested (e.g. [datahub.metadata.schema_classes.DatasetProperties]) :return: Optionally, a map of aspect_name to aspect_value as a dictionary if present, aspect_value will be set to None if that aspect was not found. Returns None on HTTP status 404. :raises HttpError: if the HTTP response is not a 200 """ @@ -527,8 +527,10 @@ def get_aspects_for_entity( return result - def get_entity_semityped(self, entity_urn: str) -> AspectBag: - """Get all non-timeseries aspects for an entity (experimental). + def get_entity_semityped( + self, entity_urn: str, aspects: Optional[List[str]] = None + ) -> AspectBag: + """Get (all) non-timeseries aspects for an entity. This method is called "semityped" because it returns aspects as a dictionary of properly typed objects. While the returned dictionary is constrained using a TypedDict, @@ -538,11 +540,12 @@ def get_entity_semityped(self, entity_urn: str) -> AspectBag: something, even if the entity doesn't actually exist in DataHub. :param entity_urn: The urn of the entity + :param aspects: Optional list of aspect names being requested (e.g. ["schemaMetadata", "datasetProperties"]) :returns: A dictionary of aspect name to aspect value. If an aspect is not found, it will not be present in the dictionary. The entity's key aspect will always be present. """ - response_json = self.get_entity_raw(entity_urn) + response_json = self.get_entity_raw(entity_urn, aspects) # Now, we parse the response into proper aspect objects. result: AspectBag = {} From 23690320779e34fcb694df807f5dbce32395ee47 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 1 Aug 2024 10:05:49 -0700 Subject: [PATCH 13/16] docs(airflow): update min version for plugin v2 (#11065) --- docs/lineage/airflow.md | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 9d838ef8a44042..2d7707637e2d1c 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -17,7 +17,7 @@ There's two actively supported implementations of the plugin, with different Air | Approach | Airflow Version | Notes | | --------- | --------------- | --------------------------------------------------------------------------- | -| Plugin v2 | 2.3+ | Recommended. Requires Python 3.8+ | +| Plugin v2 | 2.3.4+ | Recommended. Requires Python 3.8+ | | Plugin v1 | 2.1+ | No automatic lineage extraction; may not extract lineage if the task fails. | If you're using Airflow older than 2.1, it's possible to use the v1 plugin with older versions of `acryl-datahub-airflow-plugin`. See the [compatibility section](#compatibility) for more details. @@ -66,7 +66,7 @@ enabled = True # default ``` | Name | Default value | Description | -|----------------------------|----------------------|------------------------------------------------------------------------------------------| +| -------------------------- | -------------------- | ---------------------------------------------------------------------------------------- | | enabled | true | If the plugin should be enabled. | | conn_id | datahub_rest_default | The name of the datahub rest connection. | | cluster | prod | name of the airflow cluster, this is equivalent to the `env` of the instance | @@ -132,7 +132,7 @@ conn_id = datahub_rest_default # or datahub_kafka_default ``` | Name | Default value | Description | -|----------------------------|----------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| -------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | enabled | true | If the plugin should be enabled. | | conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. | | cluster | prod | name of the airflow cluster | @@ -240,6 +240,7 @@ See this [example PR](https://github.com/datahub-project/datahub/pull/10452) whi There might be a case where the DAGs are removed from the Airflow but the corresponding pipelines and tasks are still there in the Datahub, let's call such pipelines ans tasks, `obsolete pipelines and tasks` Following are the steps to cleanup them from the datahub: + - create a DAG named `Datahub_Cleanup`, i.e. ```python @@ -263,8 +264,8 @@ with DAG( ) ``` -- ingest this DAG, and it will remove all the obsolete pipelines and tasks from the Datahub based on the `cluster` value set in the `airflow.cfg` +- ingest this DAG, and it will remove all the obsolete pipelines and tasks from the Datahub based on the `cluster` value set in the `airflow.cfg` ## Get all dataJobs associated with a dataFlow @@ -274,12 +275,7 @@ If you are looking to find all tasks (aka DataJobs) that belong to a specific pi query { dataFlow(urn: "urn:li:dataFlow:(airflow,db_etl,prod)") { childJobs: relationships( - input: { - types: ["IsPartOf"], - direction: INCOMING, - start: 0, - count: 100 - } + input: { types: ["IsPartOf"], direction: INCOMING, start: 0, count: 100 } ) { total relationships { From d5eda0de7e76df5c7503b6a3a57a578c271235ac Mon Sep 17 00:00:00 2001 From: sid-acryl <155424659+sid-acryl@users.noreply.github.com> Date: Thu, 1 Aug 2024 22:56:47 +0530 Subject: [PATCH 14/16] doc(ingestion/tableau): doc update for derived permission (#11054) Co-authored-by: Pedro Silva Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Co-authored-by: Harshal Sheth --- docs/quick-ingestion-guides/tableau/setup.md | 8 ++++++++ metadata-ingestion/docs/sources/tableau/tableau_pre.md | 9 +++++++++ 2 files changed, 17 insertions(+) diff --git a/docs/quick-ingestion-guides/tableau/setup.md b/docs/quick-ingestion-guides/tableau/setup.md index b6ccaf2a9cc9e5..81767215d5bcd5 100644 --- a/docs/quick-ingestion-guides/tableau/setup.md +++ b/docs/quick-ingestion-guides/tableau/setup.md @@ -51,6 +51,14 @@ In order to configure ingestion from Tableau, you'll first have to enable Tablea - Open a command prompt as an admin on the initial node (*where TSM is installed*) in the cluster - Run the command: `tsm maintenance metadata-services enable` +3. **Enable Derived Permissions:** This step is required only when the site is using external assets. For more detail, refer to the tableau documentation [Manage Permissions for External Assets](https://help.tableau.com/current/online/en-us/dm_perms_assets.htm). + + Follow the below steps to enable the derived permissions: + + - Sign in to Tableau Cloud or Tableau Server as an admin. + - From the left navigation pane, click Settings. + - On the General tab, under Automatic Access to Metadata about Databases and Tables, select the `Automatically grant authorized users access to metadata about databases and tables` check box. + ## Next Steps diff --git a/metadata-ingestion/docs/sources/tableau/tableau_pre.md b/metadata-ingestion/docs/sources/tableau/tableau_pre.md index 5e323da6746d21..aeb67f85b241b9 100644 --- a/metadata-ingestion/docs/sources/tableau/tableau_pre.md +++ b/metadata-ingestion/docs/sources/tableau/tableau_pre.md @@ -81,3 +81,12 @@ This may happen when the Tableau API returns NODE_LIMIT_EXCEEDED error in respon - reducing the page size using the `page_size` config param in datahub recipe (Defaults to 10). - increasing tableau configuration [metadata query node limit](https://help.tableau.com/current/server/en-us/cli_configuration-set_tsm.htm#metadata_nodelimit) to higher value. + +### `PERMISSIONS_MODE_SWITCHED` error in ingestion report +This error occurs if the Tableau site is using external assets. For more detail, refer to the Tableau documentation [Manage Permissions for External Assets](https://help.tableau.com/current/online/en-us/dm_perms_assets.htm). + +Follow the below steps to enable the derived permissions: + +1. Sign in to Tableau Cloud or Tableau Server as an admin. +2. From the left navigation pane, click Settings. +3. On the General tab, under Automatic Access to Metadata about Databases and Tables, select the `Automatically grant authorized users access to metadata about databases and tables` check box. From f78b6c08fbe606410271a26e359b165dced217cd Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Fri, 2 Aug 2024 04:57:54 -0700 Subject: [PATCH 15/16] fix(py): remove dep on types-pkg_resources (#11076) --- metadata-ingestion-modules/airflow-plugin/setup.py | 2 +- metadata-ingestion-modules/dagster-plugin/setup.py | 4 ++-- metadata-ingestion/setup.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 6d5aa74b1d96ff..2401b169cd6607 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -53,7 +53,7 @@ def get_long_description(): mypy_stubs = { "types-dataclasses", "sqlalchemy-stubs", - "types-pkg_resources", + "types-setuptools", "types-six", "types-python-dateutil", "types-requests", diff --git a/metadata-ingestion-modules/dagster-plugin/setup.py b/metadata-ingestion-modules/dagster-plugin/setup.py index 60b960e653eb2b..8a2a1d76d345bf 100644 --- a/metadata-ingestion-modules/dagster-plugin/setup.py +++ b/metadata-ingestion-modules/dagster-plugin/setup.py @@ -26,14 +26,14 @@ def get_long_description(): "dagit >= 1.3.3", *rest_common, # Ignoring the dependency below because it causes issues with the vercel built wheel install - #f"acryl-datahub[datahub-rest]{_self_pin}", + # f"acryl-datahub[datahub-rest]{_self_pin}", "acryl-datahub[datahub-rest]", } mypy_stubs = { "types-dataclasses", "sqlalchemy-stubs", - "types-pkg_resources", + "types-setuptools", "types-six", "types-python-dateutil", "types-requests", diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 20a43a94f6bdaf..445600b8abd48b 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -491,7 +491,7 @@ mypy_stubs = { "types-dataclasses", - "types-pkg_resources", + "types-setuptools", "types-six", "types-python-dateutil", # We need to avoid 2.31.0.5 and 2.31.0.4 due to From f2e461eb633c0bdd6196f94320ca09cb06fd360b Mon Sep 17 00:00:00 2001 From: Aseem Bansal Date: Fri, 2 Aug 2024 21:01:46 +0530 Subject: [PATCH 16/16] feat(ingest/mode): add option to exclude restricted (#11081) --- .../src/datahub/ingestion/source/mode.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index 4b4822bcb98cae..3da7f98e930083 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -135,9 +135,14 @@ class ModeConfig(StatefulIngestionConfigBase, DatasetLineageProviderConfigBase): connect_uri: str = Field( default="https://app.mode.com", description="Mode host URL." ) - token: str = Field(description="Mode user token.") + token: str = Field( + description="When creating workspace API key this is the 'Key ID'." + ) password: pydantic.SecretStr = Field( - description="Mode password for authentication." + description="When creating workspace API key this is the 'Secret'." + ) + exclude_restricted: bool = Field( + default=False, description="Exclude restricted collections" ) workspace: str = Field( @@ -522,6 +527,16 @@ def _get_space_name_and_tokens(self) -> dict: for s in spaces: logger.debug(f"Space: {s.get('name')}") space_name = s.get("name", "") + # Using both restricted and default_access_level because + # there is a current bug with restricted returning False everytime + # which has been reported to Mode team + if self.config.exclude_restricted and ( + s.get("restricted") or s.get("default_access_level") == "restricted" + ): + logging.debug( + f"Skipping space {space_name} due to exclude restricted" + ) + continue if not self.config.space_pattern.allowed(space_name): self.report.report_dropped_space(space_name) logging.debug(f"Skipping space {space_name} due to space pattern")