From ec9725026dca7b89d6a6464ea9b5c547debf42e5 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 2 Nov 2023 09:39:08 -0700
Subject: [PATCH 01/33] chore(ingest): remove legacy memory_leak_detector
(#9158)
---
.../src/datahub/cli/ingest_cli.py | 4 -
metadata-ingestion/src/datahub/entrypoints.py | 15 ---
.../ingestion/source/looker/looker_config.py | 6 +-
.../datahub/utilities/memory_leak_detector.py | 106 ------------------
.../tests/integration/snowflake/common.py | 3 +-
.../tests/unit/test_snowflake_source.py | 15 +--
6 files changed, 10 insertions(+), 139 deletions(-)
delete mode 100644 metadata-ingestion/src/datahub/utilities/memory_leak_detector.py
diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py
index 9b5716408f3e43..dd0287004a3686 100644
--- a/metadata-ingestion/src/datahub/cli/ingest_cli.py
+++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py
@@ -27,7 +27,6 @@
from datahub.ingestion.run.pipeline import Pipeline
from datahub.telemetry import telemetry
from datahub.upgrade import upgrade
-from datahub.utilities import memory_leak_detector
logger = logging.getLogger(__name__)
@@ -98,7 +97,6 @@ def ingest() -> None:
@click.option(
"--no-spinner", type=bool, is_flag=True, default=False, help="Turn off spinner"
)
-@click.pass_context
@telemetry.with_telemetry(
capture_kwargs=[
"dry_run",
@@ -109,9 +107,7 @@ def ingest() -> None:
"no_spinner",
]
)
-@memory_leak_detector.with_leak_detection
def run(
- ctx: click.Context,
config: str,
dry_run: bool,
preview: bool,
diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py
index 5bfab3b841fa38..0cd37cc9398549 100644
--- a/metadata-ingestion/src/datahub/entrypoints.py
+++ b/metadata-ingestion/src/datahub/entrypoints.py
@@ -70,21 +70,10 @@
version=datahub_package.nice_version_name(),
prog_name=datahub_package.__package_name__,
)
-@click.option(
- "-dl",
- "--detect-memory-leaks",
- type=bool,
- is_flag=True,
- default=False,
- help="Run memory leak detection.",
-)
-@click.pass_context
def datahub(
- ctx: click.Context,
debug: bool,
log_file: Optional[str],
debug_vars: bool,
- detect_memory_leaks: bool,
) -> None:
if debug_vars:
# debug_vars implies debug. This option isn't actually used here, but instead
@@ -109,10 +98,6 @@ def datahub(
_logging_configured = configure_logging(debug=debug, log_file=log_file)
_logging_configured.__enter__()
- # Setup the context for the memory_leak_detector decorator.
- ctx.ensure_object(dict)
- ctx.obj["detect_memory_leaks"] = detect_memory_leaks
-
@datahub.command()
@telemetry.with_telemetry()
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py
index 96c405f7257d04..98d58c9fc9d87f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py
@@ -121,7 +121,10 @@ class LookerCommonConfig(DatasetSourceConfigMixin):
"discoverable. When disabled, adds this information to the description of the column.",
)
platform_name: str = Field(
- "looker", description="Default platform name. Don't change."
+ # TODO: This shouldn't be part of the config.
+ "looker",
+ description="Default platform name.",
+ hidden_from_docs=True,
)
extract_column_level_lineage: bool = Field(
True,
@@ -213,7 +216,6 @@ def external_url_defaults_to_api_config_base_url(
def stateful_ingestion_should_be_enabled(
cls, v: Optional[bool], *, values: Dict[str, Any], **kwargs: Dict[str, Any]
) -> Optional[bool]:
-
stateful_ingestion: StatefulStaleMetadataRemovalConfig = cast(
StatefulStaleMetadataRemovalConfig, values.get("stateful_ingestion")
)
diff --git a/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py b/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py
deleted file mode 100644
index 85ad0fb4938eb8..00000000000000
--- a/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import fnmatch
-import gc
-import logging
-import sys
-import tracemalloc
-from collections import defaultdict
-from functools import wraps
-from typing import Any, Callable, Dict, List, TypeVar, Union, cast
-
-import click
-from typing_extensions import Concatenate, ParamSpec
-
-logger = logging.getLogger(__name__)
-T = TypeVar("T")
-P = ParamSpec("P")
-
-
-def _trace_has_file(trace: tracemalloc.Traceback, file_pattern: str) -> bool:
- for frame_index in range(len(trace)):
- cur_frame = trace[frame_index]
- if fnmatch.fnmatch(cur_frame.filename, file_pattern):
- return True
- return False
-
-
-def _init_leak_detection() -> None:
- # Initialize trace malloc to track up to 25 stack frames.
- tracemalloc.start(25)
- if sys.version_info >= (3, 9):
- # Nice to reset peak to 0. Available for versions >= 3.9.
- tracemalloc.reset_peak()
- # Enable leak debugging in the garbage collector.
- gc.set_debug(gc.DEBUG_LEAK)
-
-
-def _perform_leak_detection() -> None:
- # Log potentially useful memory usage metrics
- logger.info(f"GC count before collect {gc.get_count()}")
- traced_memory_size, traced_memory_peak = tracemalloc.get_traced_memory()
- logger.info(f"Traced Memory: size={traced_memory_size}, peak={traced_memory_peak}")
- num_unreacheable_objects = gc.collect()
- logger.info(f"Number of unreachable objects = {num_unreacheable_objects}")
- logger.info(f"GC count after collect {gc.get_count()}")
-
- # Collect unique traces of all live objects in the garbage - these have potential leaks.
- unique_traces_to_objects: Dict[
- Union[tracemalloc.Traceback, int], List[object]
- ] = defaultdict(list)
- for obj in gc.garbage:
- obj_trace = tracemalloc.get_object_traceback(obj)
- if obj_trace is not None:
- if _trace_has_file(obj_trace, "*datahub/*.py"):
- # Leaking object
- unique_traces_to_objects[obj_trace].append(obj)
- else:
- unique_traces_to_objects[id(obj)].append(obj)
- logger.info("Potentially leaking objects start")
- for key, obj_list in sorted(
- unique_traces_to_objects.items(),
- key=lambda item: sum(
- [sys.getsizeof(o) for o in item[1]]
- ), # TODO: add support for deep sizeof
- reverse=True,
- ):
- if isinstance(key, tracemalloc.Traceback):
- obj_traceback: tracemalloc.Traceback = cast(tracemalloc.Traceback, key)
- logger.info(
- f"#Objects:{len(obj_list)}; Total memory:{sum([sys.getsizeof(obj) for obj in obj_list])};"
- + " Allocation Trace:\n\t"
- + "\n\t".join(obj_traceback.format(limit=25))
- )
- else:
- logger.info(
- f"#Objects:{len(obj_list)}; Total memory:{sum([sys.getsizeof(obj) for obj in obj_list])};"
- + " No Allocation Trace available!"
- )
- logger.info("Potentially leaking objects end")
-
- tracemalloc.stop()
-
-
-def with_leak_detection(
- func: Callable[Concatenate[click.Context, P], T]
-) -> Callable[Concatenate[click.Context, P], T]:
- @wraps(func)
- def wrapper(ctx: click.Context, *args: P.args, **kwargs: P.kwargs) -> Any:
- detect_leaks: bool = ctx.obj.get("detect_memory_leaks", False)
- if detect_leaks:
- logger.info(
- f"Initializing memory leak detection on command: {func.__module__}.{func.__name__}"
- )
- _init_leak_detection()
-
- try:
- return func(ctx, *args, **kwargs)
- finally:
- if detect_leaks:
- logger.info(
- f"Starting memory leak detection on command: {func.__module__}.{func.__name__}"
- )
- _perform_leak_detection()
- logger.info(
- f"Finished memory leak detection on command: {func.__module__}.{func.__name__}"
- )
-
- return wrapper
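The removed helper combined `tracemalloc` allocation traces with `gc.DEBUG_LEAK` to surface uncollectable objects. For reference, a minimal standalone sketch of the same pattern (illustrative only; `snapshot_leaks` is not a DataHub API):

```python
import gc
import sys
import tracemalloc


def snapshot_leaks(top_n: int = 5) -> None:
    """Log the largest objects left in gc.garbage, with their allocation traces."""
    unreachable = gc.collect()
    size, peak = tracemalloc.get_traced_memory()
    print(f"unreachable={unreachable} traced_size={size} traced_peak={peak}")
    for obj in sorted(gc.garbage, key=sys.getsizeof, reverse=True)[:top_n]:
        print(f"{type(obj).__name__}: {sys.getsizeof(obj)} bytes")
        trace = tracemalloc.get_object_traceback(obj)
        if trace is not None:
            print("\n".join(trace.format(limit=5)))


tracemalloc.start(25)        # keep up to 25 stack frames per allocation
gc.set_debug(gc.DEBUG_LEAK)  # retain collectable garbage in gc.garbage after collection

# ... run the workload under inspection here ...

snapshot_leaks()
tracemalloc.stop()
```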
diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py
index ff448eca01071f..78e54996973119 100644
--- a/metadata-ingestion/tests/integration/snowflake/common.py
+++ b/metadata-ingestion/tests/integration/snowflake/common.py
@@ -565,5 +565,4 @@ def default_query_results( # noqa: C901
"DOMAIN": "DATABASE",
},
]
- # Unreachable code
- raise Exception(f"Unknown query {query}")
+ raise ValueError(f"Unexpected query: {query}")
diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py
index 888a7c04415542..aaff878b81eeef 100644
--- a/metadata-ingestion/tests/unit/test_snowflake_source.py
+++ b/metadata-ingestion/tests/unit/test_snowflake_source.py
@@ -368,8 +368,7 @@ def default_query_results(query):
return [('{"roles":"","value":""}',)]
elif query == "select current_warehouse()":
return [("TEST_WAREHOUSE")]
- # Unreachable code
- raise Exception()
+ raise ValueError(f"Unexpected query: {query}")
connection_mock = MagicMock()
cursor_mock = MagicMock()
@@ -397,8 +396,7 @@ def query_results(query):
]
elif query == 'show grants to role "PUBLIC"':
return []
- # Unreachable code
- raise Exception()
+ raise ValueError(f"Unexpected query: {query}")
config = {
"username": "user",
@@ -441,8 +439,7 @@ def query_results(query):
return [("", "USAGE", "DATABASE", "DB1")]
elif query == 'show grants to role "PUBLIC"':
return []
- # Unreachable code
- raise Exception()
+ raise ValueError(f"Unexpected query: {query}")
setup_mock_connect(mock_connect, query_results)
@@ -485,8 +482,7 @@ def query_results(query):
]
elif query == 'show grants to role "PUBLIC"':
return []
- # Unreachable code
- raise Exception()
+ raise ValueError(f"Unexpected query: {query}")
setup_mock_connect(mock_connect, query_results)
@@ -536,8 +532,7 @@ def query_results(query):
["", "USAGE", "VIEW", "SNOWFLAKE.ACCOUNT_USAGE.ACCESS_HISTORY"],
["", "USAGE", "VIEW", "SNOWFLAKE.ACCOUNT_USAGE.OBJECT_DEPENDENCIES"],
]
- # Unreachable code
- raise Exception()
+ raise ValueError(f"Unexpected query: {query}")
setup_mock_connect(mock_connect, query_results)
From 148ad1ad9f00d6eb43d6acb270b9a90a745c8af3 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Thu, 2 Nov 2023 09:44:35 -0700
Subject: [PATCH 02/33] feat(ingest/looker): support emitting unused explores
(#9159)
---
.../ingestion/source/looker/looker_common.py | 2 +-
.../ingestion/source/looker/looker_config.py | 4 ++
.../source/looker/looker_lib_wrapper.py | 7 +++
.../ingestion/source/looker/looker_source.py | 46 +++++++++++++------
4 files changed, 45 insertions(+), 14 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
index 30c38720dd96c4..7ca5ce49019abd 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py
@@ -388,7 +388,7 @@ def _get_field_type(
# if still not found, log and continue
if type_class is None:
- logger.info(
+ logger.debug(
f"The type '{native_type}' is not recognized for field type, setting as NullTypeClass.",
)
type_class = NullTypeClass
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py
index 98d58c9fc9d87f..e6ddea9a30489e 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py
@@ -205,6 +205,10 @@ class LookerDashboardSourceConfig(
False,
description="Extract looks which are not part of any Dashboard. To enable this flag the stateful_ingestion should also be enabled.",
)
+ emit_used_explores_only: bool = Field(
+ True,
+ description="When enabled, only explores that are used by a Dashboard/Look will be ingested.",
+ )
@validator("external_base_url", pre=True, always=True)
def external_url_defaults_to_api_config_base_url(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
index b00f74b71e7922..988caba1c0d748 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py
@@ -59,6 +59,7 @@ class LookerAPIStats(BaseModel):
lookml_model_calls: int = 0
all_dashboards_calls: int = 0
all_looks_calls: int = 0
+ all_models_calls: int = 0
get_query_calls: int = 0
search_looks_calls: int = 0
search_dashboards_calls: int = 0
@@ -155,6 +156,12 @@ def dashboard(self, dashboard_id: str, fields: Union[str, List[str]]) -> Dashboa
transport_options=self.transport_options,
)
+ def all_lookml_models(self) -> Sequence[LookmlModel]:
+ self.client_stats.all_models_calls += 1
+ return self.client.all_lookml_models(
+ transport_options=self.transport_options,
+ )
+
def lookml_model_explore(self, model: str, explore_name: str) -> LookmlModelExplore:
self.client_stats.explore_calls += 1
return self.client.lookml_model_explore(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py
index 09683d790c14c7..4a98e8874bca0d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py
@@ -147,9 +147,12 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext):
)
self.reporter._looker_explore_registry = self.explore_registry
self.reporter._looker_api = self.looker_api
+
self.reachable_look_registry = set()
- self.explores_to_fetch_set: Dict[Tuple[str, str], List[str]] = {}
+ # (model, explore) -> list of charts/looks/dashboards that reference this explore
+ # The list values are used purely for debugging purposes.
+ self.reachable_explores: Dict[Tuple[str, str], List[str]] = {}
# Keep stat generators to generate entity stat aspect later
stat_generator_config: looker_usage.StatGeneratorConfig = (
@@ -378,11 +381,11 @@ def _get_input_fields_from_query(
return result
- def add_explore_to_fetch(self, model: str, explore: str, via: str) -> None:
- if (model, explore) not in self.explores_to_fetch_set:
- self.explores_to_fetch_set[(model, explore)] = []
+ def add_reachable_explore(self, model: str, explore: str, via: str) -> None:
+ if (model, explore) not in self.reachable_explores:
+ self.reachable_explores[(model, explore)] = []
- self.explores_to_fetch_set[(model, explore)].append(via)
+ self.reachable_explores[(model, explore)].append(via)
def _get_looker_dashboard_element( # noqa: C901
self, element: DashboardElement
@@ -403,7 +406,7 @@ def _get_looker_dashboard_element( # noqa: C901
f"Element {element.title}: Explores added via query: {explores}"
)
for exp in explores:
- self.add_explore_to_fetch(
+ self.add_reachable_explore(
model=element.query.model,
explore=exp,
via=f"look:{element.look_id}:query:{element.dashboard_id}",
@@ -439,7 +442,7 @@ def _get_looker_dashboard_element( # noqa: C901
explores = [element.look.query.view]
logger.debug(f"Element {title}: Explores added via look: {explores}")
for exp in explores:
- self.add_explore_to_fetch(
+ self.add_reachable_explore(
model=element.look.query.model,
explore=exp,
via=f"Look:{element.look_id}:query:{element.dashboard_id}",
@@ -483,7 +486,7 @@ def _get_looker_dashboard_element( # noqa: C901
)
for exp in explores:
- self.add_explore_to_fetch(
+ self.add_reachable_explore(
model=element.result_maker.query.model,
explore=exp,
via=f"Look:{element.look_id}:resultmaker:query",
@@ -495,7 +498,7 @@ def _get_looker_dashboard_element( # noqa: C901
if filterable.view is not None and filterable.model is not None:
model = filterable.model
explores.append(filterable.view)
- self.add_explore_to_fetch(
+ self.add_reachable_explore(
model=filterable.model,
explore=filterable.view,
via=f"Look:{element.look_id}:resultmaker:filterable",
@@ -694,20 +697,26 @@ def _make_dashboard_metadata_events(
def _make_explore_metadata_events(
self,
) -> Iterable[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]:
+ if self.source_config.emit_used_explores_only:
+ explores_to_fetch = list(self.reachable_explores.keys())
+ else:
+ explores_to_fetch = list(self.list_all_explores())
+ explores_to_fetch.sort()
+
with concurrent.futures.ThreadPoolExecutor(
max_workers=self.source_config.max_threads
) as async_executor:
- self.reporter.total_explores = len(self.explores_to_fetch_set)
+ self.reporter.total_explores = len(explores_to_fetch)
explore_futures = {
async_executor.submit(self.fetch_one_explore, model, explore): (
model,
explore,
)
- for (model, explore) in self.explores_to_fetch_set
+ for (model, explore) in explores_to_fetch
}
- for future in concurrent.futures.as_completed(explore_futures):
+ for future in concurrent.futures.wait(explore_futures).done:
events, explore_id, start_time, end_time = future.result()
del explore_futures[future]
self.reporter.explores_scanned += 1
@@ -717,6 +726,17 @@ def _make_explore_metadata_events(
f"Running time of fetch_one_explore for {explore_id}: {(end_time - start_time).total_seconds()}"
)
+ def list_all_explores(self) -> Iterable[Tuple[str, str]]:
+ # returns a list of (model, explore) tuples
+
+ for model in self.looker_api.all_lookml_models():
+ if model.name is None or model.explores is None:
+ continue
+ for explore in model.explores:
+ if explore.name is None:
+ continue
+ yield (model.name, explore.name)
+
def fetch_one_explore(
self, model: str, explore: str
) -> Tuple[
@@ -954,7 +974,7 @@ def _input_fields_from_dashboard_element(
)
if explore is not None:
# add this to the list of explores to finally generate metadata for
- self.add_explore_to_fetch(
+ self.add_reachable_explore(
input_field.model, input_field.explore, entity_urn
)
entity_urn = explore.get_explore_urn(self.source_config)
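The patch keys each submitted future by its `(model, explore)` pair so a completed result can be attributed back to the explore that produced it. A self-contained sketch of that bookkeeping (the registry contents and `fetch_one` are placeholders, not DataHub code):

```python
import concurrent.futures
from typing import Dict, List, Tuple

# Placeholder registry: (model, explore) -> references that made it reachable.
reachable_explores: Dict[Tuple[str, str], List[str]] = {
    ("orders_model", "orders"): ["dashboard:1"],
    ("users_model", "users"): ["look:7", "dashboard:2"],
}


def fetch_one(model: str, explore: str) -> str:
    return f"metadata for {model}.{explore}"


with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
    futures = {
        executor.submit(fetch_one, model, explore): (model, explore)
        for (model, explore) in reachable_explores
    }
    for future in concurrent.futures.as_completed(futures):
        model, explore = futures[future]
        print(f"{model}.{explore} -> {future.result()}")
```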
From 7ff48b37aaea165ba3c3cb6f9f9f742ea2e37654 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Fri, 3 Nov 2023 10:23:37 -0500
Subject: [PATCH 03/33] refactor(policy): refactor policy locking, no
functional difference (#9163)
---
.../authorization/DataHubAuthorizer.java | 111 +++++++++---------
1 file changed, 55 insertions(+), 56 deletions(-)
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
index e30fb93109915a..f8b28f6c182a72 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
@@ -19,6 +19,7 @@
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.Lock;
import java.util.concurrent.locks.ReadWriteLock;
import java.util.concurrent.locks.ReentrantReadWriteLock;
import javax.annotation.Nonnull;
@@ -55,7 +56,8 @@ public enum AuthorizationMode {
// Maps privilege name to the associated set of policies for fast access.
// Not concurrent data structure because writes are always against the entire thing.
private final Map<String, List<DataHubPolicyInfo>> _policyCache = new HashMap<>(); // Shared Policy Cache.
- private final ReadWriteLock _lockPolicyCache = new ReentrantReadWriteLock();
+ private final ReadWriteLock readWriteLock = new ReentrantReadWriteLock();
+ private final Lock readLock = readWriteLock.readLock();
private final ScheduledExecutorService _refreshExecutorService = Executors.newScheduledThreadPool(1);
private final PolicyRefreshRunnable _policyRefreshRunnable;
@@ -74,7 +76,7 @@ public DataHubAuthorizer(
_systemAuthentication = Objects.requireNonNull(systemAuthentication);
_mode = Objects.requireNonNull(mode);
_policyEngine = new PolicyEngine(systemAuthentication, Objects.requireNonNull(entityClient));
- _policyRefreshRunnable = new PolicyRefreshRunnable(systemAuthentication, new PolicyFetcher(entityClient), _policyCache, _lockPolicyCache);
+ _policyRefreshRunnable = new PolicyRefreshRunnable(systemAuthentication, new PolicyFetcher(entityClient), _policyCache, readWriteLock.writeLock());
_refreshExecutorService.scheduleAtFixedRate(_policyRefreshRunnable, delayIntervalSeconds, refreshIntervalSeconds, TimeUnit.SECONDS);
}
@@ -93,41 +95,30 @@ public AuthorizationResult authorize(@Nonnull final AuthorizationRequest request
Optional<ResolvedEntitySpec> resolvedResourceSpec = request.getResourceSpec().map(_entitySpecResolver::resolve);
- _lockPolicyCache.readLock().lock();
- try {
- // 1. Fetch the policies relevant to the requested privilege.
- final List<DataHubPolicyInfo> policiesToEvaluate = _policyCache.getOrDefault(request.getPrivilege(), new ArrayList<>());
-
- // 2. Evaluate each policy.
- for (DataHubPolicyInfo policy : policiesToEvaluate) {
- if (isRequestGranted(policy, request, resolvedResourceSpec)) {
- // Short circuit if policy has granted privileges to this actor.
- return new AuthorizationResult(request, AuthorizationResult.Type.ALLOW,
- String.format("Granted by policy with type: %s", policy.getType()));
- }
+ // 1. Fetch the policies relevant to the requested privilege.
+ final List<DataHubPolicyInfo> policiesToEvaluate = getOrDefault(request.getPrivilege(), new ArrayList<>());
+
+ // 2. Evaluate each policy.
+ for (DataHubPolicyInfo policy : policiesToEvaluate) {
+ if (isRequestGranted(policy, request, resolvedResourceSpec)) {
+ // Short circuit if policy has granted privileges to this actor.
+ return new AuthorizationResult(request, AuthorizationResult.Type.ALLOW,
+ String.format("Granted by policy with type: %s", policy.getType()));
}
- return new AuthorizationResult(request, AuthorizationResult.Type.DENY, null);
- } finally {
- _lockPolicyCache.readLock().unlock();
}
+ return new AuthorizationResult(request, AuthorizationResult.Type.DENY, null);
}
public List<String> getGrantedPrivileges(final String actor, final Optional<EntitySpec> resourceSpec) {
+ // 1. Fetch all policies
+ final List<DataHubPolicyInfo> policiesToEvaluate = getOrDefault(ALL, new ArrayList<>());
- _lockPolicyCache.readLock().lock();
- try {
- // 1. Fetch all policies
- final List<DataHubPolicyInfo> policiesToEvaluate = _policyCache.getOrDefault(ALL, new ArrayList<>());
-
- Urn actorUrn = UrnUtils.getUrn(actor);
- final ResolvedEntitySpec resolvedActorSpec = _entitySpecResolver.resolve(new EntitySpec(actorUrn.getEntityType(), actor));
+ Urn actorUrn = UrnUtils.getUrn(actor);
+ final ResolvedEntitySpec resolvedActorSpec = _entitySpecResolver.resolve(new EntitySpec(actorUrn.getEntityType(), actor));
- Optional<ResolvedEntitySpec> resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve);
+ Optional<ResolvedEntitySpec> resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve);
- return _policyEngine.getGrantedPrivileges(policiesToEvaluate, resolvedActorSpec, resolvedResourceSpec);
- } finally {
- _lockPolicyCache.readLock().unlock();
- }
+ return _policyEngine.getGrantedPrivileges(policiesToEvaluate, resolvedActorSpec, resolvedResourceSpec);
}
/**
@@ -143,36 +134,31 @@ public AuthorizedActors authorizedActors(
boolean allUsers = false;
boolean allGroups = false;
- _lockPolicyCache.readLock().lock();
- try {
- // Step 1: Find policies granting the privilege.
- final List<DataHubPolicyInfo> policiesToEvaluate = _policyCache.getOrDefault(privilege, new ArrayList<>());
-
- Optional<ResolvedEntitySpec> resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve);
+ // Step 1: Find policies granting the privilege.
+ final List<DataHubPolicyInfo> policiesToEvaluate = getOrDefault(privilege, new ArrayList<>());
+ Optional<ResolvedEntitySpec> resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve);
- // Step 2: For each policy, determine whether the resource is a match.
- for (DataHubPolicyInfo policy : policiesToEvaluate) {
- if (!PoliciesConfig.ACTIVE_POLICY_STATE.equals(policy.getState())) {
- // Policy is not active, skip.
- continue;
- }
+ // Step 2: For each policy, determine whether the resource is a match.
+ for (DataHubPolicyInfo policy : policiesToEvaluate) {
+ if (!PoliciesConfig.ACTIVE_POLICY_STATE.equals(policy.getState())) {
+ // Policy is not active, skip.
+ continue;
+ }
- final PolicyEngine.PolicyActors matchingActors = _policyEngine.getMatchingActors(policy, resolvedResourceSpec);
+ final PolicyEngine.PolicyActors matchingActors = _policyEngine.getMatchingActors(policy, resolvedResourceSpec);
- // Step 3: For each matching policy, add actors that are authorized.
- authorizedUsers.addAll(matchingActors.getUsers());
- authorizedGroups.addAll(matchingActors.getGroups());
- if (matchingActors.allUsers()) {
- allUsers = true;
- }
- if (matchingActors.allGroups()) {
- allGroups = true;
- }
+ // Step 3: For each matching policy, add actors that are authorized.
+ authorizedUsers.addAll(matchingActors.getUsers());
+ authorizedGroups.addAll(matchingActors.getGroups());
+ if (matchingActors.allUsers()) {
+ allUsers = true;
+ }
+ if (matchingActors.allGroups()) {
+ allGroups = true;
}
- } finally {
- _lockPolicyCache.readLock().unlock();
}
+
// Step 4: Return all authorized users and groups.
return new AuthorizedActors(privilege, authorizedUsers, authorizedGroups, allUsers, allGroups);
}
@@ -234,6 +220,16 @@ private Optional<Urn> getUrnFromRequestActor(String actor) {
}
}
+ private List<DataHubPolicyInfo> getOrDefault(String key, List<DataHubPolicyInfo> defaultValue) {
+ readLock.lock();
+ try {
+ return _policyCache.getOrDefault(key, defaultValue);
+ } finally {
+ // To unlock the acquired read thread
+ readLock.unlock();
+ }
+ }
+
/**
* A {@link Runnable} used to periodically fetch a new instance of the policies Cache.
*
@@ -247,7 +243,7 @@ static class PolicyRefreshRunnable implements Runnable {
private final Authentication _systemAuthentication;
private final PolicyFetcher _policyFetcher;
- private final Map<String, List<DataHubPolicyInfo>> _policyCache;
- private final ReadWriteLock _lockPolicyCache;
+ private final Lock writeLock;
@Override
public void run() {
@@ -274,13 +270,16 @@ public void run() {
return;
}
}
- _lockPolicyCache.writeLock().lock();
+
+ writeLock.lock();
try {
_policyCache.clear();
_policyCache.putAll(newCache);
} finally {
- _lockPolicyCache.writeLock().unlock();
+ // To unlock the acquired write thread
+ writeLock.unlock();
}
+
log.debug(String.format("Successfully fetched %s policies.", total));
} catch (Exception e) {
log.error("Caught exception while loading Policy cache. Will retry on next scheduled attempt.", e);
From 07311115c5ca436f64fad9c685cfc586cc5d4180 Mon Sep 17 00:00:00 2001
From: Kos Korchak <97058061+kkorchak@users.noreply.github.com>
Date: Fri, 3 Nov 2023 13:00:15 -0400
Subject: [PATCH 04/33] API test for managing access token privilege (#9167)
---
.../tests/privileges/test_privileges.py | 155 ++++++++++++++----
1 file changed, 127 insertions(+), 28 deletions(-)
diff --git a/smoke-test/tests/privileges/test_privileges.py b/smoke-test/tests/privileges/test_privileges.py
index 13d6b6cf3415a4..740311754678ef 100644
--- a/smoke-test/tests/privileges/test_privileges.py
+++ b/smoke-test/tests/privileges/test_privileges.py
@@ -52,6 +52,20 @@ def privileges_and_test_user_setup(admin_session):
wait_for_writes_to_sync()
+@tenacity.retry(
+ stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec)
+)
+def _ensure_cant_perform_action(session, json,assertion_key):
+ action_response = session.post(
+ f"{get_frontend_url()}/api/v2/graphql", json=json)
+ action_response.raise_for_status()
+ action_data = action_response.json()
+
+ assert action_data["errors"][0]["extensions"]["code"] == 403
+ assert action_data["errors"][0]["extensions"]["type"] == "UNAUTHORIZED"
+ assert action_data["data"][assertion_key] == None
+
+
@tenacity.retry(
stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec)
)
@@ -67,20 +81,6 @@ def _ensure_can_create_secret(session, json, urn):
assert secret_data["data"]["createSecret"] == urn
-@tenacity.retry(
- stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec)
-)
-def _ensure_cant_create_secret(session, json):
- create_secret_response = session.post(
- f"{get_frontend_url()}/api/v2/graphql", json=json)
- create_secret_response.raise_for_status()
- create_secret_data = create_secret_response.json()
-
- assert create_secret_data["errors"][0]["extensions"]["code"] == 403
- assert create_secret_data["errors"][0]["extensions"]["type"] == "UNAUTHORIZED"
- assert create_secret_data["data"]["createSecret"] == None
-
-
@tenacity.retry(
stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec)
)
@@ -99,17 +99,19 @@ def _ensure_can_create_ingestion_source(session, json):
@tenacity.retry(
- stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec)
+ stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec)
)
-def _ensure_cant_create_ingestion_source(session, json):
- create_source_response = session.post(
+def _ensure_can_create_access_token(session, json):
+ create_access_token_success = session.post(
f"{get_frontend_url()}/api/v2/graphql", json=json)
- create_source_response.raise_for_status()
- create_source_data = create_source_response.json()
+ create_access_token_success.raise_for_status()
+ ingestion_data = create_access_token_success.json()
- assert create_source_data["errors"][0]["extensions"]["code"] == 403
- assert create_source_data["errors"][0]["extensions"]["type"] == "UNAUTHORIZED"
- assert create_source_data["data"]["createIngestionSource"] == None
+ assert ingestion_data
+ assert ingestion_data["data"]
+ assert ingestion_data["data"]["createAccessToken"]
+ assert ingestion_data["data"]["createAccessToken"]["accessToken"] is not None
+ assert ingestion_data["data"]["createAccessToken"]["__typename"] == "AccessToken"
@pytest.mark.dependency(depends=["test_healthchecks"])
@@ -132,7 +134,7 @@ def test_privilege_to_create_and_manage_secrets():
}
},
}
- _ensure_cant_create_secret(user_session, create_secret)
+ _ensure_cant_perform_action(user_session, create_secret,"createSecret")
# Assign privileges to the new user to manage secrets
@@ -166,7 +168,7 @@ def test_privilege_to_create_and_manage_secrets():
remove_policy(policy_urn, admin_session)
# Ensure user can't create secret after policy is removed
- _ensure_cant_create_secret(user_session, create_secret)
+ _ensure_cant_perform_action(user_session, create_secret,"createSecret")
@pytest.mark.dependency(depends=["test_healthchecks"])
@@ -182,11 +184,18 @@ def test_privilege_to_create_and_manage_ingestion_source():
createIngestionSource(input: $input)\n}""",
"variables": {"input":{"type":"snowflake","name":"test","config":
{"recipe":
- "{\"source\":{\"type\":\"snowflake\",\"config\":{\"account_id\":null,\"include_table_lineage\":true,\"include_view_lineage\":true,\"include_tables\":true,\"include_views\":true,\"profiling\":{\"enabled\":true,\"profile_table_level_only\":true},\"stateful_ingestion\":{\"enabled\":true}}}}",
+ """{\"source\":{\"type\":\"snowflake\",\"config\":{
+ \"account_id\":null,
+ \"include_table_lineage\":true,
+ \"include_view_lineage\":true,
+ \"include_tables\":true,
+ \"include_views\":true,
+ \"profiling\":{\"enabled\":true,\"profile_table_level_only\":true},
+ \"stateful_ingestion\":{\"enabled\":true}}}}""",
"executorId":"default","debugMode":False,"extraArgs":[]}}},
}
- _ensure_cant_create_ingestion_source(user_session, create_ingestion_source)
+ _ensure_cant_perform_action(user_session, create_ingestion_source, "createIngestionSource")
# Assign privileges to the new user to manage ingestion source
@@ -201,7 +210,14 @@ def test_privilege_to_create_and_manage_ingestion_source():
updateIngestionSource(urn: $urn, input: $input)\n}""",
"variables": {"urn":ingestion_source_urn,
"input":{"type":"snowflake","name":"test updated",
- "config":{"recipe":"{\"source\":{\"type\":\"snowflake\",\"config\":{\"account_id\":null,\"include_table_lineage\":true,\"include_view_lineage\":true,\"include_tables\":true,\"include_views\":true,\"profiling\":{\"enabled\":true,\"profile_table_level_only\":true},\"stateful_ingestion\":{\"enabled\":true}}}}",
+ "config":{"recipe":"""{\"source\":{\"type\":\"snowflake\",\"config\":{
+ \"account_id\":null,
+ \"include_table_lineage\":true,
+ \"include_view_lineage\":true,
+ \"include_tables\":true,
+ \"include_views\":true,
+ \"profiling\":{\"enabled\":true,\"profile_table_level_only\":true},
+ \"stateful_ingestion\":{\"enabled\":true}}}}""",
"executorId":"default","debugMode":False,"extraArgs":[]}}}
}
@@ -238,4 +254,87 @@ def test_privilege_to_create_and_manage_ingestion_source():
remove_policy(policy_urn, admin_session)
# Ensure that user can't create ingestion source after policy is removed
- _ensure_cant_create_ingestion_source(user_session, create_ingestion_source)
\ No newline at end of file
+ _ensure_cant_perform_action(user_session, create_ingestion_source, "createIngestionSource")
+
+
+@pytest.mark.dependency(depends=["test_healthchecks"])
+def test_privilege_to_create_and_manage_access_tokens():
+
+ (admin_user, admin_pass) = get_admin_credentials()
+ admin_session = login_as(admin_user, admin_pass)
+ user_session = login_as("user", "user")
+
+
+ # Verify new user can't create access token
+ create_access_token = {
+ "query": """mutation createAccessToken($input: CreateAccessTokenInput!) {\n
+ createAccessToken(input: $input) {\n accessToken\n __typename\n }\n}\n""",
+ "variables": {"input":{"actorUrn":"urn:li:corpuser:user",
+ "type":"PERSONAL",
+ "duration":"ONE_MONTH",
+ "name":"test",
+ "description":"test"}}
+ }
+
+ _ensure_cant_perform_action(user_session, create_access_token,"createAccessToken")
+
+
+ # Assign privileges to the new user to create and manage access tokens
+ policy_urn = create_user_policy("urn:li:corpuser:user", ["MANAGE_ACCESS_TOKENS"], admin_session)
+
+
+ # Verify new user can create and manage access token(create, revoke)
+ # Create a access token
+ _ensure_can_create_access_token(user_session, create_access_token)
+
+
+ # List access tokens first to get token id
+ list_access_tokens = {
+ "query": """query listAccessTokens($input: ListAccessTokenInput!) {\n
+ listAccessTokens(input: $input) {\n
+ start\n count\n total\n tokens {\n urn\n type\n
+ id\n name\n description\n actorUrn\n ownerUrn\n
+ createdAt\n expiresAt\n __typename\n }\n __typename\n }\n}\n""",
+ "variables": {
+ "input":{
+ "start":0,"count":10,"filters":[{
+ "field":"ownerUrn",
+ "values":["urn:li:corpuser:user"]}]}
+ }
+ }
+
+ list_tokens_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=list_access_tokens)
+ list_tokens_response.raise_for_status()
+ list_tokens_data = list_tokens_response.json()
+
+ assert list_tokens_data
+ assert list_tokens_data["data"]
+ assert list_tokens_data["data"]["listAccessTokens"]["tokens"][0]["id"] is not None
+
+ access_token_id = list_tokens_data["data"]["listAccessTokens"]["tokens"][0]["id"]
+
+
+ # Revoke access token
+ revoke_access_token = {
+ "query": "mutation revokeAccessToken($tokenId: String!) {\n revokeAccessToken(tokenId: $tokenId)\n}\n",
+ "variables": {
+ "tokenId": access_token_id
+ },
+ }
+
+ revoke_token_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=revoke_access_token)
+ revoke_token_response.raise_for_status()
+ revoke_token_data = revoke_token_response.json()
+
+ assert revoke_token_data
+ assert revoke_token_data["data"]
+ assert revoke_token_data["data"]["revokeAccessToken"]
+ assert revoke_token_data["data"]["revokeAccessToken"] is True
+
+
+ # Remove the policy
+ remove_policy(policy_urn, admin_session)
+
+
+ # Ensure that user can't create access token after policy is removed
+ _ensure_cant_perform_action(user_session, create_access_token,"createAccessToken")
\ No newline at end of file
From ddb4e1b5ffa01763d7d3353a506d4329faf11e25 Mon Sep 17 00:00:00 2001
From: Davi Arnaut
Date: Fri, 3 Nov 2023 10:26:11 -0700
Subject: [PATCH 05/33] fix(mysql-setup): quote database name (#9169)
---
docker/mysql-setup/init.sql | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/docker/mysql-setup/init.sql b/docker/mysql-setup/init.sql
index b789329ddfd179..b6a1d47fb2a022 100644
--- a/docker/mysql-setup/init.sql
+++ b/docker/mysql-setup/init.sql
@@ -1,6 +1,6 @@
-- create datahub database
-CREATE DATABASE IF NOT EXISTS DATAHUB_DB_NAME CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
-USE DATAHUB_DB_NAME;
+CREATE DATABASE IF NOT EXISTS `DATAHUB_DB_NAME` CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
+USE `DATAHUB_DB_NAME`;
-- create metadata aspect table
create table if not exists metadata_aspect_v2 (
From c2bc41d15eed31f89076913f641298ded5219a4f Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Fri, 3 Nov 2023 12:29:31 -0500
Subject: [PATCH 06/33] fix(health): fix health check url authentication
(#9117)
---
.../authentication/AuthenticationRequest.java | 12 ++++
.../filter/AuthenticationFilter.java | 13 ++++-
.../HealthStatusAuthenticator.java | 55 +++++++++++++++++++
.../src/main/resources/application.yml | 2 +
metadata-service/health-servlet/build.gradle | 22 --------
.../openapi/config/SpringWebConfig.java | 2 -
.../health}/HealthCheckController.java | 30 ++++++----
metadata-service/war/build.gradle | 1 -
.../webapp/WEB-INF/openapiServlet-servlet.xml | 2 +-
settings.gradle | 1 -
10 files changed, 101 insertions(+), 39 deletions(-)
create mode 100644 metadata-service/auth-impl/src/main/java/com/datahub/authentication/authenticator/HealthStatusAuthenticator.java
delete mode 100644 metadata-service/health-servlet/build.gradle
rename metadata-service/{health-servlet/src/main/java/com/datahub/health/controller => openapi-servlet/src/main/java/io/datahubproject/openapi/health}/HealthCheckController.java (79%)
diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authentication/AuthenticationRequest.java b/metadata-auth/auth-api/src/main/java/com/datahub/authentication/AuthenticationRequest.java
index 91f15f9d5ae61d..5673bac5442b29 100644
--- a/metadata-auth/auth-api/src/main/java/com/datahub/authentication/AuthenticationRequest.java
+++ b/metadata-auth/auth-api/src/main/java/com/datahub/authentication/AuthenticationRequest.java
@@ -1,6 +1,8 @@
package com.datahub.authentication;
import com.datahub.plugins.auth.authentication.Authenticator;
+import lombok.Getter;
+
import java.util.Map;
import java.util.Objects;
import java.util.TreeMap;
@@ -13,14 +15,24 @@
* Currently, this class only hold the inbound request's headers, but could certainly be extended
* to contain additional information like the request parameters, body, ip, etc as needed.
*/
+@Getter
public class AuthenticationRequest {
private final Map<String, String> caseInsensitiveHeaders;
+ private final String servletInfo;
+ private final String pathInfo;
+
public AuthenticationRequest(@Nonnull final Map<String, String> requestHeaders) {
+ this("", "", requestHeaders);
+ }
+
+ public AuthenticationRequest(@Nonnull String servletInfo, @Nonnull String pathInfo, @Nonnull final Map<String, String> requestHeaders) {
Objects.requireNonNull(requestHeaders);
caseInsensitiveHeaders = new TreeMap<>(String.CASE_INSENSITIVE_ORDER);
caseInsensitiveHeaders.putAll(requestHeaders);
+ this.servletInfo = servletInfo;
+ this.pathInfo = pathInfo;
}
/**
diff --git a/metadata-service/auth-filter/src/main/java/com/datahub/auth/authentication/filter/AuthenticationFilter.java b/metadata-service/auth-filter/src/main/java/com/datahub/auth/authentication/filter/AuthenticationFilter.java
index e15918a8131580..8c7b3ac8b98f04 100644
--- a/metadata-service/auth-filter/src/main/java/com/datahub/auth/authentication/filter/AuthenticationFilter.java
+++ b/metadata-service/auth-filter/src/main/java/com/datahub/auth/authentication/filter/AuthenticationFilter.java
@@ -2,6 +2,7 @@
import com.datahub.authentication.authenticator.AuthenticatorChain;
import com.datahub.authentication.authenticator.DataHubSystemAuthenticator;
+import com.datahub.authentication.authenticator.HealthStatusAuthenticator;
import com.datahub.authentication.authenticator.NoOpAuthenticator;
import com.datahub.authentication.token.StatefulTokenService;
import com.datahub.plugins.PluginConstant;
@@ -29,6 +30,7 @@
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
+import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -148,7 +150,7 @@ private void buildAuthenticatorChain() {
}
private AuthenticationRequest buildAuthContext(HttpServletRequest request) {
- return new AuthenticationRequest(Collections.list(request.getHeaderNames())
+ return new AuthenticationRequest(request.getServletPath(), request.getPathInfo(), Collections.list(request.getHeaderNames())
.stream()
.collect(Collectors.toMap(headerName -> headerName, request::getHeader)));
}
@@ -242,7 +244,14 @@ private void registerNativeAuthenticator(AuthenticatorChain authenticatorChain,
final Authenticator authenticator = clazz.newInstance();
// Successfully created authenticator. Now init and register it.
log.debug(String.format("Initializing Authenticator with name %s", type));
- authenticator.init(configs, authenticatorContext);
+ if (authenticator instanceof HealthStatusAuthenticator) {
+ Map<String, Object> authenticatorConfig = new HashMap<>(Map.of(SYSTEM_CLIENT_ID_CONFIG,
+ this.configurationProvider.getAuthentication().getSystemClientId()));
+ authenticatorConfig.putAll(Optional.ofNullable(internalAuthenticatorConfig.getConfigs()).orElse(Collections.emptyMap()));
+ authenticator.init(authenticatorConfig, authenticatorContext);
+ } else {
+ authenticator.init(configs, authenticatorContext);
+ }
log.info(String.format("Registering Authenticator with name %s", type));
authenticatorChain.register(authenticator);
} catch (Exception e) {
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/authenticator/HealthStatusAuthenticator.java b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/authenticator/HealthStatusAuthenticator.java
new file mode 100644
index 00000000000000..5749eacf5d25d6
--- /dev/null
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/authenticator/HealthStatusAuthenticator.java
@@ -0,0 +1,55 @@
+package com.datahub.authentication.authenticator;
+
+import com.datahub.authentication.Actor;
+import com.datahub.authentication.ActorType;
+import com.datahub.authentication.Authentication;
+import com.datahub.authentication.AuthenticationException;
+import com.datahub.authentication.AuthenticationRequest;
+import com.datahub.authentication.AuthenticatorContext;
+import com.datahub.plugins.auth.authentication.Authenticator;
+import lombok.extern.slf4j.Slf4j;
+
+import javax.annotation.Nonnull;
+import javax.annotation.Nullable;
+import java.util.Collections;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+
+import static com.datahub.authentication.AuthenticationConstants.SYSTEM_CLIENT_ID_CONFIG;
+
+
+/**
+ * This Authenticator is used for allowing access for unauthenticated health check endpoints
+ *
+ * It exists to support load balancers, liveness/readiness checks
+ *
+ */
+@Slf4j
+public class HealthStatusAuthenticator implements Authenticator {
+ private static final Set<String> HEALTH_ENDPOINTS = Set.of(
+ "/openapi/check/",
+ "/openapi/up/"
+ );
+ private String systemClientId;
+
+ @Override
+ public void init(@Nonnull final Map<String, Object> config, @Nullable final AuthenticatorContext context) {
+ Objects.requireNonNull(config, "Config parameter cannot be null");
+ this.systemClientId = Objects.requireNonNull((String) config.get(SYSTEM_CLIENT_ID_CONFIG),
+ String.format("Missing required config %s", SYSTEM_CLIENT_ID_CONFIG));
+ }
+
+ @Override
+ public Authentication authenticate(@Nonnull AuthenticationRequest context) throws AuthenticationException {
+ Objects.requireNonNull(context);
+ if (HEALTH_ENDPOINTS.stream().anyMatch(prefix -> String.join("", context.getServletInfo(), context.getPathInfo()).startsWith(prefix))) {
+ return new Authentication(
+ new Actor(ActorType.USER, systemClientId),
+ "",
+ Collections.emptyMap()
+ );
+ }
+ throw new AuthenticationException("Authorization not allowed. Non-health check endpoint.");
+ }
+}
diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml
index b817208672e08b..91b10a75c922e2 100644
--- a/metadata-service/configuration/src/main/resources/application.yml
+++ b/metadata-service/configuration/src/main/resources/application.yml
@@ -11,6 +11,8 @@ authentication:
# Key used to validate incoming tokens. Should typically be the same as authentication.tokenService.signingKey
signingKey: ${DATAHUB_TOKEN_SERVICE_SIGNING_KEY:WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94=}
salt: ${DATAHUB_TOKEN_SERVICE_SALT:ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O=}
+ # Required for unauthenticated health check endpoints - best not to remove.
+ - type: com.datahub.authentication.authenticator.HealthStatusAuthenticator
# Normally failures are only warnings, enable this to throw them.
logAuthenticatorExceptions: ${METADATA_SERVICE_AUTHENTICATOR_EXCEPTIONS_ENABLED:false}
diff --git a/metadata-service/health-servlet/build.gradle b/metadata-service/health-servlet/build.gradle
deleted file mode 100644
index 6095f724b3cd44..00000000000000
--- a/metadata-service/health-servlet/build.gradle
+++ /dev/null
@@ -1,22 +0,0 @@
-apply plugin: 'java'
-
-dependencies {
-
- implementation project(':metadata-service:factories')
-
- implementation externalDependency.guava
- implementation externalDependency.reflections
- implementation externalDependency.springBoot
- implementation externalDependency.springCore
- implementation externalDependency.springDocUI
- implementation externalDependency.springWeb
- implementation externalDependency.springWebMVC
- implementation externalDependency.springBeans
- implementation externalDependency.springContext
- implementation externalDependency.slf4jApi
- compileOnly externalDependency.lombok
- implementation externalDependency.antlr4Runtime
- implementation externalDependency.antlr4
-
- annotationProcessor externalDependency.lombok
-}
\ No newline at end of file
diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java
index 71e8c79a2275a5..e4f49df90c3921 100644
--- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java
+++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java
@@ -44,7 +44,6 @@ public GroupedOpenApi defaultOpenApiGroup() {
.group("default")
.packagesToExclude(
"io.datahubproject.openapi.operations",
- "com.datahub.health",
"io.datahubproject.openapi.health"
).build();
}
@@ -55,7 +54,6 @@ public GroupedOpenApi operationsOpenApiGroup() {
.group("operations")
.packagesToScan(
"io.datahubproject.openapi.operations",
- "com.datahub.health",
"io.datahubproject.openapi.health"
).build();
}
diff --git a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/health/HealthCheckController.java
similarity index 79%
rename from metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java
rename to metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/health/HealthCheckController.java
index c200e63e0d4977..c90603bf88c31e 100644
--- a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java
+++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/health/HealthCheckController.java
@@ -1,5 +1,6 @@
-package com.datahub.health.controller;
+package io.datahubproject.openapi.health;
+import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;
import com.linkedin.gms.factory.config.ConfigurationProvider;
import io.swagger.v3.oas.annotations.tags.Tag;
@@ -9,7 +10,6 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
-import java.util.function.Supplier;
import org.opensearch.action.admin.cluster.health.ClusterHealthRequest;
import org.opensearch.action.admin.cluster.health.ClusterHealthResponse;
@@ -27,7 +27,7 @@
@RestController
-@RequestMapping("/check")
+@RequestMapping("/")
@Tag(name = "HealthCheck", description = "An API for checking health of GMS and its clients.")
public class HealthCheckController {
@Autowired
@@ -41,6 +41,12 @@ public HealthCheckController(ConfigurationProvider config) {
this::getElasticHealth, config.getHealthCheck().getCacheDurationSeconds(), TimeUnit.SECONDS);
}
+ @GetMapping(path = "/check/ready", produces = MediaType.APPLICATION_JSON_VALUE)
+ public ResponseEntity<Boolean> getCombinedHealthCheck(String... checks) {
+ return ResponseEntity.status(getCombinedDebug(checks).getStatusCode())
+ .body(getCombinedDebug(checks).getStatusCode().is2xxSuccessful());
+ }
+
/**
* Combined health check endpoint for checking GMS clients.
* For now, just checks the health of the ElasticSearch client
@@ -48,11 +54,10 @@ public HealthCheckController(ConfigurationProvider config) {
* that component). The status code will be 200 if all components are okay, and 500 if one or more components are not
* healthy.
*/
- @GetMapping(path = "/ready", produces = MediaType.APPLICATION_JSON_VALUE)
- public ResponseEntity
-
:::note
Inline markdown or code snippets are not yet supported for field level documentation.
:::
-
### 2. Set up the reporter
The reporter interface enables the source to report statistics, warnings, failures, and other information about the run.
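As a hedged sketch of what a source-specific report might look like (assuming the `SourceReport` base class and a `report_warning(key, reason)` helper, taken here as assumptions rather than verified API):

```python
from dataclasses import dataclass, field
from typing import List

from datahub.ingestion.api.source import SourceReport


@dataclass
class MySourceReport(SourceReport):
    # Custom counters/lists surface in the run summary alongside the built-in fields.
    tables_scanned: int = 0
    filtered_tables: List[str] = field(default_factory=list)


report = MySourceReport()
report.tables_scanned += 1
report.report_warning("permissions", "skipped table sales.orders: access denied")
```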
@@ -71,6 +70,8 @@ some [convenience methods](./src/datahub/emitter/mce_builder.py) for commonly us
### 4. Set up the dependencies
+Note: Steps 4-8 are only required if you intend to contribute the source back to the Datahub project.
+
Declare the source's pip dependencies in the `plugins` variable of the [setup script](./setup.py).
### 5. Enable discoverability
@@ -119,37 +120,38 @@ from datahub.ingestion.api.decorators import (
@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default")
class FileSource(Source):
"""
-
- The File Source can be used to produce all kinds of metadata from a generic metadata events file.
+
+ The File Source can be used to produce all kinds of metadata from a generic metadata events file.
:::note
Events in this file can be in MCE form or MCP form.
:::
-
+
"""
... source code goes here
```
-
#### 7.2 Write custom documentation
-- Create a copy of [`source-docs-template.md`](./source-docs-template.md) and edit all relevant components.
+- Create a copy of [`source-docs-template.md`](./source-docs-template.md) and edit all relevant components.
- Name the document as `` and move it to `metadata-ingestion/docs/sources//.md`. For example for the Kafka platform, under the `kafka` plugin, move the document to `metadata-ingestion/docs/sources/kafka/kafka.md`.
- Add a quickstart recipe corresponding to the plugin under `metadata-ingestion/docs/sources//_recipe.yml`. For example, for the Kafka platform, under the `kafka` plugin, there is a quickstart recipe located at `metadata-ingestion/docs/sources/kafka/kafka_recipe.yml`.
- To write platform-specific documentation (that is cross-plugin), write the documentation under `metadata-ingestion/docs/sources//README.md`. For example, cross-plugin documentation for the BigQuery platform is located under `metadata-ingestion/docs/sources/bigquery/README.md`.
#### 7.3 Viewing the Documentation
-Documentation for the source can be viewed by running the documentation generator from the `docs-website` module.
+Documentation for the source can be viewed by running the documentation generator from the `docs-website` module.
##### Step 1: Build the Ingestion docs
+
```console
# From the root of DataHub repo
./gradlew :metadata-ingestion:docGen
```
If this finishes successfully, you will see output messages like:
+
```console
Ingestion Documentation Generation Complete
############################################
@@ -170,7 +172,8 @@ Ingestion Documentation Generation Complete
You can also find documentation files generated at `./docs/generated/ingestion/sources` relative to the root of the DataHub repo. You should be able to locate your specific source's markdown file here and investigate it to make sure things look as expected.
#### Step 2: Build the Entire Documentation
-To view how this documentation looks in the browser, there is one more step. Just build the entire docusaurus page from the `docs-website` module.
+
+To view how this documentation looks in the browser, there is one more step. Just build the entire docusaurus page from the `docs-website` module.
```console
# From the root of DataHub repo
@@ -178,6 +181,7 @@ To view how this documentation looks in the browser, there is one more step. Jus
```
This will generate messages like:
+
```console
...
> Task :docs-website:yarnGenerate
@@ -219,15 +223,15 @@ BUILD SUCCESSFUL in 35s
36 actionable tasks: 16 executed, 20 up-to-date
```
-After this you need to run the following script from the `docs-website` module.
+After this you need to run the following script from the `docs-website` module.
+
```console
cd docs-website
npm run serve
```
-Now, browse to http://localhost:3000 or whichever port npm is running on, to browse the docs.
-Your source should show up on the left sidebar under `Metadata Ingestion / Sources`.
-
+Now, browse to http://localhost:3000 or whichever port npm is running on, to browse the docs.
+Your source should show up on the left sidebar under `Metadata Ingestion / Sources`.
### 8. Add SQL Alchemy mapping (if applicable)
From 4a4c29030c0cfd2da9eab01798bc74a94fbb8c1d Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Mon, 6 Nov 2023 12:47:24 -0800
Subject: [PATCH 13/33] chore: stop ingestion-smoke CI errors on forks (#9160)
---
.github/workflows/docker-ingestion-smoke.yml | 1 +
1 file changed, 1 insertion(+)
diff --git a/.github/workflows/docker-ingestion-smoke.yml b/.github/workflows/docker-ingestion-smoke.yml
index 8d52c237928577..82b57d23609a56 100644
--- a/.github/workflows/docker-ingestion-smoke.yml
+++ b/.github/workflows/docker-ingestion-smoke.yml
@@ -47,6 +47,7 @@ jobs:
name: Build and Push Docker Image to Docker Hub
runs-on: ubuntu-latest
needs: setup
+ if: ${{ needs.setup.outputs.publish == 'true' }}
steps:
- name: Check out the repo
uses: actions/checkout@v3
From 86d2b08d2bbecc90e9adffd250c894abe54667e7 Mon Sep 17 00:00:00 2001
From: Harshal Sheth
Date: Mon, 6 Nov 2023 12:58:07 -0800
Subject: [PATCH 14/33] docs(ingest): inherit capabilities from superclasses
(#9174)
---
metadata-ingestion-modules/airflow-plugin/setup.py | 4 ++++
.../src/datahub/ingestion/api/decorators.py | 12 +++++++++++-
.../source/state/stateful_ingestion_base.py | 8 +++++++-
3 files changed, 22 insertions(+), 2 deletions(-)
diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py
index a5af881022d8c9..e88fc870cb3331 100644
--- a/metadata-ingestion-modules/airflow-plugin/setup.py
+++ b/metadata-ingestion-modules/airflow-plugin/setup.py
@@ -101,6 +101,10 @@ def get_long_description():
f"acryl-datahub[testing-utils]{_self_pin}",
# Extra requirements for loading our test dags.
"apache-airflow[snowflake]>=2.0.2",
+ # Connexion's new version breaks Airflow:
+ # See https://github.com/apache/airflow/issues/35234.
+ # TODO: We should transition to using Airflow's constraints file.
+ "connexion<3",
# https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350
# Eventually we want to set this to "snowflake-sqlalchemy>=1.4.3".
# However, that doesn't work with older versions of Airflow. Instead
diff --git a/metadata-ingestion/src/datahub/ingestion/api/decorators.py b/metadata-ingestion/src/datahub/ingestion/api/decorators.py
index 5e4427047104fe..b390ffb9dd0362 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/decorators.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/decorators.py
@@ -93,10 +93,20 @@ def capability(
"""
def wrapper(cls: Type) -> Type:
- if not hasattr(cls, "__capabilities"):
+ if not hasattr(cls, "__capabilities") or any(
+ # It's from this class and not a superclass.
+ cls.__capabilities is getattr(base, "__capabilities", None)
+ for base in cls.__bases__
+ ):
cls.__capabilities = {}
cls.get_capabilities = lambda: cls.__capabilities.values()
+ # If the superclasses have capability annotations, copy those over.
+ for base in cls.__bases__:
+ base_caps = getattr(base, "__capabilities", None)
+ if base_caps:
+ cls.__capabilities.update(base_caps)
+
cls.__capabilities[capability_name] = CapabilitySetting(
capability=capability_name, description=description, supported=supported
)
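The intent of the change above is that a subclass gets its own capability registry, seeded from (but not shared with) its base classes. A simplified illustration of that behavior, using a plain `_capabilities` dict instead of the real decorator's name-mangled attribute and `CapabilitySetting` objects:

```python
from typing import Callable, Dict, Type


def capability(name: str, description: str) -> Callable[[Type], Type]:
    def wrapper(cls: Type) -> Type:
        # Give each class its own registry, seeded from its bases, so that
        # decorating a subclass never mutates the parent's registry.
        if "_capabilities" not in cls.__dict__:
            merged: Dict[str, str] = {}
            for base in cls.__bases__:
                merged.update(getattr(base, "_capabilities", {}))
            cls._capabilities = merged
        cls._capabilities[name] = description
        return cls

    return wrapper


@capability("base-cap", "declared on the base")
class Base:
    pass


@capability("child-cap", "declared on the child")
class Child(Base):
    pass


assert set(Base._capabilities) == {"base-cap"}
assert set(Child._capabilities) == {"base-cap", "child-cap"}
```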
diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
index 7fb2cf9813cab1..d11b1f9ad6a537 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py
@@ -15,11 +15,12 @@
from datahub.configuration.time_window_config import BaseTimeWindowConfig
from datahub.configuration.validate_field_rename import pydantic_renamed_field
from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import capability
from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import (
IngestionCheckpointingProviderBase,
JobId,
)
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
from datahub.ingestion.source.state.checkpoint import Checkpoint, StateType
from datahub.ingestion.source.state.use_case_handler import (
StatefulIngestionUsecaseHandlerBase,
@@ -177,6 +178,11 @@ class StatefulIngestionReport(SourceReport):
pass
+@capability(
+ SourceCapability.DELETION_DETECTION,
+ "Optionally enabled via `stateful_ingestion.remove_stale_metadata`",
+ supported=True,
+)
class StatefulIngestionSourceBase(Source):
"""
Defines the base class for all stateful sources.
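The decorator change above lets a subclass inherit capability annotations declared on its superclasses while still adding its own, which is how StatefulIngestionSourceBase now advertises DELETION_DETECTION to every stateful source. The following is a minimal, standalone sketch of that inheritance behaviour; the names here (capability, CapEntry, StatefulBase, MySource) are illustrative stand-ins, not the actual DataHub decorator:

    from typing import Callable, Dict, NamedTuple, Type


    class CapEntry(NamedTuple):
        name: str
        description: str


    def capability(name: str, description: str) -> Callable[[Type], Type]:
        """Register a capability on a class, copying annotations from its bases."""

        def wrapper(cls: Type) -> Type:
            # Create a fresh dict only if this class does not own one yet,
            # i.e. the attribute is missing or was inherited from a superclass.
            if "__capabilities" not in cls.__dict__:
                caps: Dict[str, CapEntry] = {}
                setattr(cls, "__capabilities", caps)
                cls.get_capabilities = lambda: caps.values()

                # Copy capability annotations declared on superclasses.
                for base in cls.__bases__:
                    caps.update(getattr(base, "__capabilities", {}))

            getattr(cls, "__capabilities")[name] = CapEntry(name, description)
            return cls

        return wrapper


    @capability("DELETION_DETECTION", "Stale metadata removal")
    class StatefulBase:
        pass


    @capability("LINEAGE_COARSE", "Table-level lineage")
    class MySource(StatefulBase):
        pass


    # The subclass reports both its own and the inherited capability.
    assert {c.name for c in MySource.get_capabilities()} == {
        "DELETION_DETECTION",
        "LINEAGE_COARSE",
    }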
From 2c58c63780970606e50ba95b382dc9ffbde17bfc Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Mon, 6 Nov 2023 15:58:57 -0500
Subject: [PATCH 15/33] fix(ingest/datahub-source): Order by version in memory
(#9185)
---
.../source/datahub/datahub_database_reader.py | 100 ++++++++++++++----
.../tests/unit/test_datahub_source.py | 51 +++++++++
2 files changed, 133 insertions(+), 18 deletions(-)
create mode 100644 metadata-ingestion/tests/unit/test_datahub_source.py
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
index 96184d8d445e4e..e4f1bb275487ea 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py
@@ -1,9 +1,11 @@
import json
import logging
from datetime import datetime
-from typing import Dict, Iterable, Optional, Tuple
+from typing import Any, Generic, Iterable, List, Optional, Tuple, TypeVar
from sqlalchemy import create_engine
+from sqlalchemy.engine import Row
+from typing_extensions import Protocol
from datahub.emitter.aspect import ASPECT_MAP
from datahub.emitter.mcp import MetadataChangeProposalWrapper
@@ -20,6 +22,62 @@
DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f"
+class VersionOrderable(Protocol):
+ createdon: Any # Should restrict to only orderable types
+ version: int
+
+
+ROW = TypeVar("ROW", bound=VersionOrderable)
+
+
+class VersionOrderer(Generic[ROW]):
+ """Orders rows by (createdon, version == 0).
+
+ That is, orders rows first by createdon, and for equal timestamps, puts version 0 rows last.
+ """
+
+ def __init__(self, enabled: bool):
+ # Stores all version 0 aspects for a given createdon timestamp
+ # Once we have emitted all aspects for a given timestamp, we can emit the version 0 aspects
+ # Guaranteeing that, for a given timestamp, we always ingest version 0 aspects last
+ self.queue: Optional[Tuple[datetime, List[ROW]]] = None
+ self.enabled = enabled
+
+ def __call__(self, rows: Iterable[ROW]) -> Iterable[ROW]:
+ for row in rows:
+ yield from self._process_row(row)
+ yield from self._flush_queue()
+
+ def _process_row(self, row: ROW) -> Iterable[ROW]:
+ if not self.enabled:
+ yield row
+ return
+
+ yield from self._attempt_queue_flush(row)
+ if row.version == 0:
+ self._add_to_queue(row)
+ else:
+ yield row
+
+ def _add_to_queue(self, row: ROW) -> None:
+ if self.queue is None:
+ self.queue = (row.createdon, [row])
+ else:
+ self.queue[1].append(row)
+
+ def _attempt_queue_flush(self, row: ROW) -> Iterable[ROW]:
+ if self.queue is None:
+ return
+
+ if row.createdon > self.queue[0]:
+ yield from self._flush_queue()
+
+ def _flush_queue(self) -> Iterable[ROW]:
+ if self.queue is not None:
+ yield from self.queue[1]
+ self.queue = None
+
+
class DataHubDatabaseReader:
def __init__(
self,
@@ -40,13 +98,14 @@ def query(self) -> str:
# Offset is generally 0, unless we repeat the same createdon twice
# Ensures stable order, chronological per (urn, aspect)
- # Version 0 last, only when createdon is the same. Otherwise relies on createdon order
+ # Relies on createdon order to reflect version order
+ # Ordering of entries with the same createdon is handled by VersionOrderer
return f"""
- SELECT urn, aspect, metadata, systemmetadata, createdon
+ SELECT urn, aspect, metadata, systemmetadata, createdon, version
FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)}
WHERE createdon >= %(since_createdon)s
{"" if self.config.include_all_versions else "AND version = 0"}
- ORDER BY createdon, urn, aspect, CASE WHEN version = 0 THEN 1 ELSE 0 END, version
+ ORDER BY createdon, urn, aspect, version
LIMIT %(limit)s
OFFSET %(offset)s
"""
@@ -54,6 +113,14 @@ def query(self) -> str:
def get_aspects(
self, from_createdon: datetime, stop_time: datetime
) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]:
+ orderer = VersionOrderer[Row](enabled=self.config.include_all_versions)
+ rows = self._get_rows(from_createdon=from_createdon, stop_time=stop_time)
+ for row in orderer(rows):
+ mcp = self._parse_row(row)
+ if mcp:
+ yield mcp, row.createdon
+
+ def _get_rows(self, from_createdon: datetime, stop_time: datetime) -> Iterable[Row]:
with self.engine.connect() as conn:
ts = from_createdon
offset = 0
@@ -69,34 +136,31 @@ def get_aspects(
return
for i, row in enumerate(rows):
- row_dict = row._asdict()
- mcp = self._parse_row(row_dict)
- if mcp:
- yield mcp, row_dict["createdon"]
+ yield row
- if ts == row_dict["createdon"]:
- offset += i
+ if ts == row.createdon:
+ offset += i + 1
else:
- ts = row_dict["createdon"]
+ ts = row.createdon
offset = 0
- def _parse_row(self, d: Dict) -> Optional[MetadataChangeProposalWrapper]:
+ def _parse_row(self, row: Row) -> Optional[MetadataChangeProposalWrapper]:
try:
- json_aspect = post_json_transform(json.loads(d["metadata"]))
- json_metadata = post_json_transform(json.loads(d["systemmetadata"] or "{}"))
+ json_aspect = post_json_transform(json.loads(row.metadata))
+ json_metadata = post_json_transform(json.loads(row.systemmetadata or "{}"))
system_metadata = SystemMetadataClass.from_obj(json_metadata)
return MetadataChangeProposalWrapper(
- entityUrn=d["urn"],
- aspect=ASPECT_MAP[d["aspect"]].from_obj(json_aspect),
+ entityUrn=row.urn,
+ aspect=ASPECT_MAP[row.aspect].from_obj(json_aspect),
systemMetadata=system_metadata,
changeType=ChangeTypeClass.UPSERT,
)
except Exception as e:
logger.warning(
- f"Failed to parse metadata for {d['urn']}: {e}", exc_info=True
+ f"Failed to parse metadata for {row.urn}: {e}", exc_info=True
)
self.report.num_database_parse_errors += 1
self.report.database_parse_errors.setdefault(
str(e), LossyDict()
- ).setdefault(d["aspect"], LossyList()).append(d["urn"])
+ ).setdefault(row.aspect, LossyList()).append(row.urn)
return None
diff --git a/metadata-ingestion/tests/unit/test_datahub_source.py b/metadata-ingestion/tests/unit/test_datahub_source.py
new file mode 100644
index 00000000000000..adc131362b326b
--- /dev/null
+++ b/metadata-ingestion/tests/unit/test_datahub_source.py
@@ -0,0 +1,51 @@
+from dataclasses import dataclass
+
+import pytest
+
+from datahub.ingestion.source.datahub.datahub_database_reader import (
+ VersionOrderable,
+ VersionOrderer,
+)
+
+
+@dataclass
+class MockRow(VersionOrderable):
+ createdon: int
+ version: int
+ urn: str
+
+
+@pytest.fixture
+def rows():
+ return [
+ MockRow(0, 0, "one"),
+ MockRow(0, 1, "one"),
+ MockRow(0, 0, "two"),
+ MockRow(0, 0, "three"),
+ MockRow(0, 1, "three"),
+ MockRow(0, 2, "three"),
+ MockRow(0, 1, "two"),
+ MockRow(0, 4, "three"),
+ MockRow(0, 5, "three"),
+ MockRow(1, 6, "three"),
+ MockRow(1, 0, "four"),
+ MockRow(2, 0, "five"),
+ MockRow(2, 1, "six"),
+ MockRow(2, 0, "six"),
+ MockRow(3, 0, "seven"),
+ MockRow(3, 0, "eight"),
+ ]
+
+
+def test_version_orderer(rows):
+ orderer = VersionOrderer[MockRow](enabled=True)
+ ordered_rows = list(orderer(rows))
+ assert ordered_rows == sorted(
+ ordered_rows, key=lambda x: (x.createdon, x.version == 0)
+ )
+
+
+def test_version_orderer_disabled(rows):
+ orderer = VersionOrderer[MockRow](enabled=False)
+ ordered_rows = list(orderer(rows))
+ assert ordered_rows == rows
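As a quick illustration of what the new VersionOrderer does at runtime: rows that share a createdon timestamp are re-emitted with their version-0 aspects last, which is the ordering the SQL query previously enforced and now leaves to memory. The Row dataclass below is hypothetical; any object exposing createdon and version satisfies VersionOrderable, and the import path is the one used by the unit test above.

    from dataclasses import dataclass

    from datahub.ingestion.source.datahub.datahub_database_reader import VersionOrderer


    @dataclass
    class Row:
        createdon: int
        version: int


    rows = [Row(createdon=1, version=0), Row(createdon=1, version=1), Row(createdon=2, version=0)]

    orderer = VersionOrderer[Row](enabled=True)
    print(list(orderer(rows)))
    # [Row(createdon=1, version=1), Row(createdon=1, version=0), Row(createdon=2, version=0)]

    # With enabled=False the rows pass through unchanged.
    print(list(VersionOrderer[Row](enabled=False)(rows)) == rows)  # True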
From f2ce3ab62cc29bd0d4d4cade2577a50a39fa0f32 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Mon, 6 Nov 2023 15:19:55 -0600
Subject: [PATCH 16/33] lint(frontend): fix HeaderLinks lint error (#9189)
---
.../src/app/shared/admin/HeaderLinks.tsx | 28 +++++++++----------
1 file changed, 14 insertions(+), 14 deletions(-)
diff --git a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx
index 3f46f35889fd18..4a7a4938ea9709 100644
--- a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx
+++ b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx
@@ -105,20 +105,20 @@ export function HeaderLinks(props: Props) {
View and modify your data dictionary
-
+
}
>
From 34aa08b7f38d733adcfe31ca97131e1ea52b49e6 Mon Sep 17 00:00:00 2001
From: John Joyce
Date: Mon, 6 Nov 2023 16:51:05 -0800
Subject: [PATCH 17/33] refactor(ui): Refactor entity page loading indicators
(#9195)
An unrelated smoke test is failing.
---
.../src/app/entity/EntityPage.tsx | 4 +-
.../containers/profile/EntityProfile.tsx | 3 --
.../profile/header/EntityHeader.tsx | 46 +++++++++++--------
.../header/EntityHeaderLoadingSection.tsx | 29 ++++++++++++
.../src/app/lineage/LineageExplorer.tsx | 7 +--
.../src/app/lineage/LineageLoadingSection.tsx | 27 +++++++++++
6 files changed, 86 insertions(+), 30 deletions(-)
create mode 100644 datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeaderLoadingSection.tsx
create mode 100644 datahub-web-react/src/app/lineage/LineageLoadingSection.tsx
diff --git a/datahub-web-react/src/app/entity/EntityPage.tsx b/datahub-web-react/src/app/entity/EntityPage.tsx
index 09233dbd89f694..916fa417954126 100644
--- a/datahub-web-react/src/app/entity/EntityPage.tsx
+++ b/datahub-web-react/src/app/entity/EntityPage.tsx
@@ -8,7 +8,6 @@ import { useEntityRegistry } from '../useEntityRegistry';
import analytics, { EventType } from '../analytics';
import { decodeUrn } from './shared/utils';
import { useGetGrantedPrivilegesQuery } from '../../graphql/policy.generated';
-import { Message } from '../shared/Message';
import { UnauthorizedPage } from '../authorization/UnauthorizedPage';
import { ErrorSection } from '../shared/error/ErrorSection';
import { VIEW_ENTITY_PAGE } from './shared/constants';
@@ -34,7 +33,7 @@ export const EntityPage = ({ entityType }: Props) => {
const isLineageSupported = entity.isLineageEnabled();
const isLineageMode = useIsLineageMode();
const authenticatedUserUrn = useUserContext()?.user?.urn;
- const { loading, error, data } = useGetGrantedPrivilegesQuery({
+ const { error, data } = useGetGrantedPrivilegesQuery({
variables: {
input: {
actorUrn: authenticatedUserUrn as string,
@@ -71,7 +70,6 @@ export const EntityPage = ({ entityType }: Props) => {
return (
<>
- {loading && }
{error && }
{data && !canViewEntityPage && }
{canViewEntityPage &&
diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx
index 5384eb94429ed4..74c127cb05dd9c 100644
--- a/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx
+++ b/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx
@@ -4,7 +4,6 @@ import { MutationHookOptions, MutationTuple, QueryHookOptions, QueryResult } fro
import styled from 'styled-components/macro';
import { useHistory } from 'react-router';
import { EntityType, Exact } from '../../../../../types.generated';
-import { Message } from '../../../../shared/Message';
import {
getEntityPath,
getOnboardingStepIdsForEntityType,
@@ -274,7 +273,6 @@ export const EntityProfile = ({
}}
>
<>
- {loading && }
{(error && ) ||
(!loading && (
@@ -323,7 +321,6 @@ export const EntityProfile = ({
banner
/>
)}
- {loading && }
{(error && ) || (
{isLineageMode ? (
diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx
index 97595a515b34d5..69389f5dcf6fc0 100644
--- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx
+++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx
@@ -16,6 +16,7 @@ import ShareButton from '../../../../../shared/share/ShareButton';
import { capitalizeFirstLetterOnly } from '../../../../../shared/textUtil';
import { useUserContext } from '../../../../../context/useUserContext';
import { useEntityRegistry } from '../../../../../useEntityRegistry';
+import EntityHeaderLoadingSection from './EntityHeaderLoadingSection';
const TitleWrapper = styled.div`
display: flex;
@@ -81,7 +82,7 @@ type Props = {
};
export const EntityHeader = ({ headerDropdownItems, headerActionItems, isNameEditable, subHeader }: Props) => {
- const { urn, entityType, entityData } = useEntityData();
+ const { urn, entityType, entityData, loading } = useEntityData();
const refetch = useRefetch();
const me = useUserContext();
const platformName = getPlatformName(entityData);
@@ -99,25 +100,32 @@ export const EntityHeader = ({ headerDropdownItems, headerActionItems, isNameEdi
<>
-
-
-
- {entityData?.deprecation?.deprecated && (
-
- )}
- {entityData?.health && (
- ) || (
+ <>
+
+
+
+ {entityData?.deprecation?.deprecated && (
+
+ )}
+ {entityData?.health && (
+
+ )}
+
+
- )}
-
-
+ >
+ )}
diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeaderLoadingSection.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeaderLoadingSection.tsx
new file mode 100644
index 00000000000000..bbf813804edd43
--- /dev/null
+++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeaderLoadingSection.tsx
@@ -0,0 +1,29 @@
+import * as React from 'react';
+import { Skeleton, Space } from 'antd';
+import styled from 'styled-components';
+import { ANTD_GRAY } from '../../../constants';
+
+const ContextSkeleton = styled(Skeleton.Input)`
+ && {
+ width: 320px;
+ border-radius: 4px;
+ background-color: ${ANTD_GRAY[3]};
+ }
+`;
+
+const NameSkeleton = styled(Skeleton.Input)`
+ && {
+ width: 240px;
+ border-radius: 4px;
+ background-color: ${ANTD_GRAY[3]};
+ }
+`;
+
+export default function EntityHeaderLoadingSection() {
+ return (
+
+
+
+
+ );
+}
diff --git a/datahub-web-react/src/app/lineage/LineageExplorer.tsx b/datahub-web-react/src/app/lineage/LineageExplorer.tsx
index ed0b26bde11efa..f59d1843b8a99b 100644
--- a/datahub-web-react/src/app/lineage/LineageExplorer.tsx
+++ b/datahub-web-react/src/app/lineage/LineageExplorer.tsx
@@ -3,7 +3,6 @@ import { useHistory } from 'react-router';
import { Button, Drawer } from 'antd';
import { InfoCircleOutlined } from '@ant-design/icons';
import styled from 'styled-components';
-import { Message } from '../shared/Message';
import { useEntityRegistry } from '../useEntityRegistry';
import CompactContext from '../shared/CompactContext';
import { EntityAndType, EntitySelectParams, FetchedEntities } from './types';
@@ -18,12 +17,10 @@ import { ErrorSection } from '../shared/error/ErrorSection';
import usePrevious from '../shared/usePrevious';
import { useGetLineageTimeParams } from './utils/useGetLineageTimeParams';
import analytics, { EventType } from '../analytics';
+import LineageLoadingSection from './LineageLoadingSection';
const DEFAULT_DISTANCE_FROM_TOP = 106;
-const LoadingMessage = styled(Message)`
- margin-top: 10%;
-`;
const FooterButtonGroup = styled.div`
display: flex;
justify-content: space-between;
@@ -167,7 +164,7 @@ export default function LineageExplorer({ urn, type }: Props) {
return (
<>
{error && }
- {loading && }
+ {loading && }
{!!data && (
+
+
+ );
+}
From 279fdd50d7870cc404a58a5c9afbf6b3c7c432ec Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Mon, 6 Nov 2023 19:51:20 -0600
Subject: [PATCH 18/33] fix(security): fix for zookeeper CVE-2023-44981 (#9190)
---
build.gradle | 4 ++--
metadata-service/restli-api/build.gradle | 6 ++++++
2 files changed, 8 insertions(+), 2 deletions(-)
diff --git a/build.gradle b/build.gradle
index bd282535fa13cd..31e005e001cf05 100644
--- a/build.gradle
+++ b/build.gradle
@@ -1,7 +1,7 @@
buildscript {
ext.junitJupiterVersion = '5.6.1'
// Releases: https://github.com/linkedin/rest.li/blob/master/CHANGELOG.md
- ext.pegasusVersion = '29.45.0'
+ ext.pegasusVersion = '29.46.8'
ext.mavenVersion = '3.6.3'
ext.springVersion = '5.3.29'
ext.springBootVersion = '2.7.14'
@@ -212,7 +212,7 @@ project.ext.externalDependency = [
'testContainersOpenSearch': 'org.opensearch:opensearch-testcontainers:2.0.0',
'typesafeConfig':'com.typesafe:config:1.4.1',
'wiremock':'com.github.tomakehurst:wiremock:2.10.0',
- 'zookeeper': 'org.apache.zookeeper:zookeeper:3.4.14',
+ 'zookeeper': 'org.apache.zookeeper:zookeeper:3.7.2',
'wire': 'com.squareup.wire:wire-compiler:3.7.1',
'charle': 'com.charleskorn.kaml:kaml:0.53.0',
'common': 'commons-io:commons-io:2.7',
diff --git a/metadata-service/restli-api/build.gradle b/metadata-service/restli-api/build.gradle
index ed4f4118dba307..f182d11b6baebf 100644
--- a/metadata-service/restli-api/build.gradle
+++ b/metadata-service/restli-api/build.gradle
@@ -8,4 +8,10 @@ dependencies {
restClientCompile spec.product.pegasus.d2
restClientCompile spec.product.pegasus.restliClient
+
+ constraints {
+ restClientCompile(externalDependency.zookeeper) {
+ because("CVE-2023-44981")
+ }
+ }
}
\ No newline at end of file
From ac9a0140570b3ada060ce716304f33ff62a1348a Mon Sep 17 00:00:00 2001
From: John Joyce
Date: Mon, 6 Nov 2023 18:33:02 -0800
Subject: [PATCH 19/33] refactor(ui): Rename "dataset details" button text to
"view details" on lineage sidebar profile (#9196)
---
datahub-web-react/src/app/lineage/LineageExplorer.tsx | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/datahub-web-react/src/app/lineage/LineageExplorer.tsx b/datahub-web-react/src/app/lineage/LineageExplorer.tsx
index f59d1843b8a99b..28cd7025f51f45 100644
--- a/datahub-web-react/src/app/lineage/LineageExplorer.tsx
+++ b/datahub-web-react/src/app/lineage/LineageExplorer.tsx
@@ -217,7 +217,7 @@ export default function LineageExplorer({ urn, type }: Props) {
Close
)
From 45770013c9bdaadfb49950c67a838aef879a8e8a Mon Sep 17 00:00:00 2001
From: John Joyce
Date: Mon, 6 Nov 2023 18:33:13 -0800
Subject: [PATCH 20/33] feat(ui): Add command-k icons to search bar (#9194)
---
.../src/app/home/HomePageHeader.tsx | 1 +
datahub-web-react/src/app/search/CommandK.tsx | 29 +++++++++++++++
.../src/app/search/SearchBar.tsx | 37 ++++++++++++-------
.../src/app/search/SearchHeader.tsx | 1 +
4 files changed, 55 insertions(+), 13 deletions(-)
create mode 100644 datahub-web-react/src/app/search/CommandK.tsx
diff --git a/datahub-web-react/src/app/home/HomePageHeader.tsx b/datahub-web-react/src/app/home/HomePageHeader.tsx
index e5c01252a865b6..0052d54f562ebd 100644
--- a/datahub-web-react/src/app/home/HomePageHeader.tsx
+++ b/datahub-web-react/src/app/home/HomePageHeader.tsx
@@ -276,6 +276,7 @@ export const HomePageHeader = () => {
combineSiblings
showQuickFilters
showViewAllResults
+ showCommandK
/>
{searchResultsToShow && searchResultsToShow.length > 0 && (
diff --git a/datahub-web-react/src/app/search/CommandK.tsx b/datahub-web-react/src/app/search/CommandK.tsx
new file mode 100644
index 00000000000000..13e55a0e3f2661
--- /dev/null
+++ b/datahub-web-react/src/app/search/CommandK.tsx
@@ -0,0 +1,29 @@
+import React from 'react';
+import styled from 'styled-components';
+import { ANTD_GRAY } from '../entity/shared/constants';
+
+const Container = styled.div`
+ color: ${ANTD_GRAY[6]};
+ background-color: #ffffff;
+ opacity: 0.9;
+ border-color: black;
+ border-radius: 6px;
+ border: 1px solid ${ANTD_GRAY[6]};
+ padding-right: 6px;
+ padding-left: 6px;
+ margin-right: 4px;
+ margin-left: 4px;
+`;
+
+const Letter = styled.span`
+ padding: 2px;
+`;
+
+export const CommandK = () => {
+ return (
+ <Container>
+ <Letter>⌘</Letter>
+ <Letter>K</Letter>
+ </Container>
+ );
+};
diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx
index 5f797e68fe0e8b..a23ead83caf541 100644
--- a/datahub-web-react/src/app/search/SearchBar.tsx
+++ b/datahub-web-react/src/app/search/SearchBar.tsx
@@ -23,6 +23,7 @@ import { navigateToSearchUrl } from './utils/navigateToSearchUrl';
import ViewAllSearchItem from './ViewAllSearchItem';
import { ViewSelect } from '../entity/view/select/ViewSelect';
import { combineSiblingsInAutoComplete } from './utils/combineSiblingsInAutoComplete';
+import { CommandK } from './CommandK';
const StyledAutoComplete = styled(AutoComplete)`
width: 100%;
@@ -114,6 +115,7 @@ interface Props {
fixAutoComplete?: boolean;
hideRecommendations?: boolean;
showQuickFilters?: boolean;
+ showCommandK?: boolean;
viewsEnabled?: boolean;
combineSiblings?: boolean;
setIsSearchBarFocused?: (isSearchBarFocused: boolean) => void;
@@ -142,6 +144,7 @@ export const SearchBar = ({
fixAutoComplete,
hideRecommendations,
showQuickFilters,
+ showCommandK = false,
viewsEnabled = false,
combineSiblings = false,
setIsSearchBarFocused,
@@ -153,6 +156,8 @@ export const SearchBar = ({
const [searchQuery, setSearchQuery] = useState(initialQuery);
const [selected, setSelected] = useState();
const [isDropdownVisible, setIsDropdownVisible] = useState(false);
+ const [isFocused, setIsFocused] = useState(false);
+
useEffect(() => setSelected(initialQuery), [initialQuery]);
const searchEntityTypes = entityRegistry.getSearchEntityTypes();
@@ -277,11 +282,13 @@ export const SearchBar = ({
function handleFocus() {
if (onFocus) onFocus();
handleSearchBarClick(true);
+ setIsFocused(true);
}
function handleBlur() {
if (onBlur) onBlur();
handleSearchBarClick(false);
+ setIsFocused(false);
}
function handleSearch(query: string, type?: EntityType, appliedQuickFilters?: FacetFilterInput[]) {
@@ -294,18 +301,21 @@ export const SearchBar = ({
const searchInputRef = useRef(null);
useEffect(() => {
- const handleKeyDown = (event) => {
- // Support command-k to select the search bar.
- // 75 is the keyCode for 'k'
- if ((event.metaKey || event.ctrlKey) && event.keyCode === 75) {
- (searchInputRef?.current as any)?.focus();
- }
- };
- document.addEventListener('keydown', handleKeyDown);
- return () => {
- document.removeEventListener('keydown', handleKeyDown);
- };
- }, []);
+ if (showCommandK) {
+ const handleKeyDown = (event) => {
+ // Support command-k to select the search bar.
+ // 75 is the keyCode for 'k'
+ if ((event.metaKey || event.ctrlKey) && event.keyCode === 75) {
+ (searchInputRef?.current as any)?.focus();
+ }
+ };
+ document.addEventListener('keydown', handleKeyDown);
+ return () => {
+ document.removeEventListener('keydown', handleKeyDown);
+ };
+ }
+ return () => null;
+ }, [showCommandK]);
return (
@@ -377,7 +387,7 @@ export const SearchBar = ({
data-testid="search-input"
onFocus={handleFocus}
onBlur={handleBlur}
- allowClear={{ clearIcon: }}
+ allowClear={(isFocused && { clearIcon: }) || false}
prefix={
<>
{viewsEnabled && (
@@ -411,6 +421,7 @@ export const SearchBar = ({
>
}
ref={searchInputRef}
+ suffix={(showCommandK && !isFocused && <CommandK />) || null}
/>
diff --git a/datahub-web-react/src/app/search/SearchHeader.tsx b/datahub-web-react/src/app/search/SearchHeader.tsx
index 91f9753a3d6012..76e78a11d3e9d9 100644
--- a/datahub-web-react/src/app/search/SearchHeader.tsx
+++ b/datahub-web-react/src/app/search/SearchHeader.tsx
@@ -108,6 +108,7 @@ export const SearchHeader = ({
fixAutoComplete
showQuickFilters
showViewAllResults
+ showCommandK
/>
From 88cde08d060041bfb6f585ed7a486f6ba5886733 Mon Sep 17 00:00:00 2001
From: Chris Collins
Date: Mon, 6 Nov 2023 21:34:17 -0500
Subject: [PATCH 21/33] feat(ui): Update Apollo cache to work with union types
(#9193)
---
datahub-web-react/codegen.yml | 3 ++
datahub-web-react/package.json | 1 +
datahub-web-react/src/App.tsx | 3 ++
datahub-web-react/yarn.lock | 73 ++++++++++++++++++++++++++++++++++
4 files changed, 80 insertions(+)
diff --git a/datahub-web-react/codegen.yml b/datahub-web-react/codegen.yml
index 96a2bd61379205..35728e8aeb7d49 100644
--- a/datahub-web-react/codegen.yml
+++ b/datahub-web-react/codegen.yml
@@ -20,6 +20,9 @@ generates:
src/types.generated.ts:
plugins:
- 'typescript'
+ src/possibleTypes.generated.ts:
+ plugins:
+ - 'fragment-matcher'
src/:
preset: near-operation-file
presetConfig:
diff --git a/datahub-web-react/package.json b/datahub-web-react/package.json
index 2d9329919fdc1b..019295f3e6ffeb 100644
--- a/datahub-web-react/package.json
+++ b/datahub-web-react/package.json
@@ -11,6 +11,7 @@
"@apollo/client": "^3.3.19",
"@craco/craco": "^6.1.1",
"@data-ui/xy-chart": "^0.0.84",
+ "@graphql-codegen/fragment-matcher": "^5.0.0",
"@miragejs/graphql": "^0.1.11",
"@monaco-editor/react": "^4.3.1",
"@react-hook/window-size": "^3.0.7",
diff --git a/datahub-web-react/src/App.tsx b/datahub-web-react/src/App.tsx
index b6bc608dccbbb0..342a89f350429f 100644
--- a/datahub-web-react/src/App.tsx
+++ b/datahub-web-react/src/App.tsx
@@ -36,6 +36,7 @@ import { DataPlatformEntity } from './app/entity/dataPlatform/DataPlatformEntity
import { DataProductEntity } from './app/entity/dataProduct/DataProductEntity';
import { DataPlatformInstanceEntity } from './app/entity/dataPlatformInstance/DataPlatformInstanceEntity';
import { RoleEntity } from './app/entity/Access/RoleEntity';
+import possibleTypesResult from './possibleTypes.generated';
/*
Construct Apollo Client
@@ -77,6 +78,8 @@ const client = new ApolloClient({
},
},
},
+ // need to define possibleTypes to allow us to use Apollo cache with union types
+ possibleTypes: possibleTypesResult.possibleTypes,
}),
credentials: 'include',
defaultOptions: {
diff --git a/datahub-web-react/yarn.lock b/datahub-web-react/yarn.lock
index 590f3ebcef8c33..ce0f2f514dad1e 100644
--- a/datahub-web-react/yarn.lock
+++ b/datahub-web-react/yarn.lock
@@ -2298,6 +2298,14 @@
"@graphql-tools/utils" "^6"
tslib "~2.0.1"
+"@graphql-codegen/fragment-matcher@^5.0.0":
+ version "5.0.0"
+ resolved "https://registry.yarnpkg.com/@graphql-codegen/fragment-matcher/-/fragment-matcher-5.0.0.tgz#2a016715e42e8f21aa08830f34a4d0a930e660fe"
+ integrity sha512-mbash9E8eY6RSMSNrrO+C9JJEn8rdr8ORaxMpgdWL2qe2q/TlLUCE3ZvQvHkSc7GjBnMEk36LncA8ApwHR2BHg==
+ dependencies:
+ "@graphql-codegen/plugin-helpers" "^5.0.0"
+ tslib "~2.5.0"
+
"@graphql-codegen/near-operation-file-preset@^1.17.13":
version "1.18.6"
resolved "https://registry.yarnpkg.com/@graphql-codegen/near-operation-file-preset/-/near-operation-file-preset-1.18.6.tgz#2378ac75feaeaa1cfd2146bd84bf839b1fe20d9d"
@@ -2331,6 +2339,18 @@
lodash "~4.17.0"
tslib "~2.3.0"
+"@graphql-codegen/plugin-helpers@^5.0.0":
+ version "5.0.1"
+ resolved "https://registry.yarnpkg.com/@graphql-codegen/plugin-helpers/-/plugin-helpers-5.0.1.tgz#e2429fcfba3f078d5aa18aa062d46c922bbb0d55"
+ integrity sha512-6L5sb9D8wptZhnhLLBcheSPU7Tg//DGWgc5tQBWX46KYTOTQHGqDpv50FxAJJOyFVJrveN9otWk9UT9/yfY4ww==
+ dependencies:
+ "@graphql-tools/utils" "^10.0.0"
+ change-case-all "1.0.15"
+ common-tags "1.8.2"
+ import-from "4.0.0"
+ lodash "~4.17.0"
+ tslib "~2.5.0"
+
"@graphql-codegen/typescript-operations@1.17.13":
version "1.17.13"
resolved "https://registry.yarnpkg.com/@graphql-codegen/typescript-operations/-/typescript-operations-1.17.13.tgz#a5b08c1573b9507ca5a9e66e795aecc40ddc5305"
@@ -2584,6 +2604,16 @@
dependencies:
tslib "^2.4.0"
+"@graphql-tools/utils@^10.0.0":
+ version "10.0.8"
+ resolved "https://registry.yarnpkg.com/@graphql-tools/utils/-/utils-10.0.8.tgz#c7b84275ec83dc42ad9f3d4ffc424ff682075759"
+ integrity sha512-yjyA8ycSa1WRlJqyX/aLqXeE5DvF/H02+zXMUFnCzIDrj0UvLMUrxhmVFnMK0Q2n3bh4uuTeY3621m5za9ovXw==
+ dependencies:
+ "@graphql-typed-document-node/core" "^3.1.1"
+ cross-inspect "1.0.0"
+ dset "^3.1.2"
+ tslib "^2.4.0"
+
"@graphql-tools/utils@^6":
version "6.2.4"
resolved "https://registry.yarnpkg.com/@graphql-tools/utils/-/utils-6.2.4.tgz#38a2314d2e5e229ad4f78cca44e1199e18d55856"
@@ -2618,6 +2648,11 @@
resolved "https://registry.yarnpkg.com/@graphql-typed-document-node/core/-/core-3.1.0.tgz#0eee6373e11418bfe0b5638f654df7a4ca6a3950"
integrity sha512-wYn6r8zVZyQJ6rQaALBEln5B1pzxb9shV5Ef97kTvn6yVGrqyXVnDqnU24MXnFubR+rZjBY9NWuxX3FB2sTsjg==
+"@graphql-typed-document-node/core@^3.1.1":
+ version "3.2.0"
+ resolved "https://registry.yarnpkg.com/@graphql-typed-document-node/core/-/core-3.2.0.tgz#5f3d96ec6b2354ad6d8a28bf216a1d97b5426861"
+ integrity sha512-mB9oAsNCm9aM3/SOv4YtBMqZbYj10R7dkq8byBqxGY/ncFwhf2oQzMV+LCRlWoDSEBJ3COiR1yeDvMtsoOsuFQ==
+
"@hapi/address@2.x.x":
version "2.1.4"
resolved "https://registry.yarnpkg.com/@hapi/address/-/address-2.1.4.tgz#5d67ed43f3fd41a69d4b9ff7b56e7c0d1d0a81e5"
@@ -7001,6 +7036,22 @@ change-case-all@1.0.14:
upper-case "^2.0.2"
upper-case-first "^2.0.2"
+change-case-all@1.0.15:
+ version "1.0.15"
+ resolved "https://registry.yarnpkg.com/change-case-all/-/change-case-all-1.0.15.tgz#de29393167fc101d646cd76b0ef23e27d09756ad"
+ integrity sha512-3+GIFhk3sNuvFAJKU46o26OdzudQlPNBCu1ZQi3cMeMHhty1bhDxu2WrEilVNYaGvqUtR1VSigFcJOiS13dRhQ==
+ dependencies:
+ change-case "^4.1.2"
+ is-lower-case "^2.0.2"
+ is-upper-case "^2.0.2"
+ lower-case "^2.0.2"
+ lower-case-first "^2.0.2"
+ sponge-case "^1.0.1"
+ swap-case "^2.0.2"
+ title-case "^3.0.3"
+ upper-case "^2.0.2"
+ upper-case-first "^2.0.2"
+
change-case@^4.1.2:
version "4.1.2"
resolved "https://registry.yarnpkg.com/change-case/-/change-case-4.1.2.tgz#fedfc5f136045e2398c0410ee441f95704641e12"
@@ -7357,6 +7408,11 @@ common-tags@1.8.0, common-tags@^1.8.0:
resolved "https://registry.yarnpkg.com/common-tags/-/common-tags-1.8.0.tgz#8e3153e542d4a39e9b10554434afaaf98956a937"
integrity sha512-6P6g0uetGpW/sdyUy/iQQCbFF0kWVMSIVSyYz7Zgjcgh8mgw8PQzDNZeyZ5DQ2gM7LBoZPHmnjz8rUthkBG5tw==
+common-tags@1.8.2:
+ version "1.8.2"
+ resolved "https://registry.yarnpkg.com/common-tags/-/common-tags-1.8.2.tgz#94ebb3c076d26032745fd54face7f688ef5ac9c6"
+ integrity sha512-gk/Z852D2Wtb//0I+kRFNKKE9dIIVirjoqPoA1wJU+XePVXZfGeBpk45+A1rKO4Q43prqWBNY/MiIeRLbPWUaA==
+
commondir@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/commondir/-/commondir-1.0.1.tgz#ddd800da0c66127393cca5950ea968a3aaf1253b"
@@ -7698,6 +7754,13 @@ cross-fetch@^3.1.5:
dependencies:
node-fetch "2.6.7"
+cross-inspect@1.0.0:
+ version "1.0.0"
+ resolved "https://registry.yarnpkg.com/cross-inspect/-/cross-inspect-1.0.0.tgz#5fda1af759a148594d2d58394a9e21364f6849af"
+ integrity sha512-4PFfn4b5ZN6FMNGSZlyb7wUhuN8wvj8t/VQHZdM4JsDcruGJ8L2kf9zao98QIrBPFCpdk27qst/AGTl7pL3ypQ==
+ dependencies:
+ tslib "^2.4.0"
+
cross-spawn@7.0.3, cross-spawn@^7.0.0, cross-spawn@^7.0.2, cross-spawn@^7.0.3:
version "7.0.3"
resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6"
@@ -8595,6 +8658,11 @@ dotenv@^8.2.0:
resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-8.6.0.tgz#061af664d19f7f4d8fc6e4ff9b584ce237adcb8b"
integrity sha512-IrPdXQsk2BbzvCBGBOTmmSH5SodmqZNt4ERAZDmW4CT+tL8VtvinqywuANaFu4bOMWki16nqf0e4oC0QIaDr/g==
+dset@^3.1.2:
+ version "3.1.3"
+ resolved "https://registry.yarnpkg.com/dset/-/dset-3.1.3.tgz#c194147f159841148e8e34ca41f638556d9542d2"
+ integrity sha512-20TuZZHCEZ2O71q9/+8BwKwZ0QtD9D8ObhrihJPr+vLLYlSuAU3/zL4cSlgbfeoGHTjCSJBa7NGcrF9/Bx/WJQ==
+
duplexer3@^0.1.4:
version "0.1.4"
resolved "https://registry.yarnpkg.com/duplexer3/-/duplexer3-0.1.4.tgz#ee01dd1cac0ed3cbc7fdbea37dc0a8f1ce002ce2"
@@ -18712,6 +18780,11 @@ tslib@~2.3.0:
resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.3.1.tgz#e8a335add5ceae51aa261d32a490158ef042ef01"
integrity sha512-77EbyPPpMz+FRFRuAFlWMtmgUWGe9UOG2Z25NqCwiIjRhOf5iKGuzSe5P2w1laq+FkRy4p+PCuVkJSGkzTEKVw==
+tslib@~2.5.0:
+ version "2.5.3"
+ resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.5.3.tgz#24944ba2d990940e6e982c4bea147aba80209913"
+ integrity sha512-mSxlJJwl3BMEQCUNnxXBU9jP4JBktcEGhURcPR6VQVlnP0FdDEsIaz0C35dXNGLyRfrATNofF0F5p2KPxQgB+w==
+
tsutils@^3.17.1:
version "3.21.0"
resolved "https://registry.yarnpkg.com/tsutils/-/tsutils-3.21.0.tgz#b48717d394cea6c1e096983eed58e9d61715b623"
From 23c98ecf7a88d11e3b195d457ab42c763818df47 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Tue, 7 Nov 2023 14:40:48 -0600
Subject: [PATCH 22/33] feat(policy): enable support for 10k+ policies (#9177)
Co-authored-by: Pedro Silva
---
.../policy/ListPoliciesResolver.java | 26 ++----
.../metadata/client/JavaEntityClient.java | 2 +-
.../metadata/search/SearchService.java | 18 ++--
.../authorization/DataHubAuthorizer.java | 21 ++---
.../datahub/authorization/PolicyFetcher.java | 62 +++++++++++---
.../authorization/DataHubAuthorizerTest.java | 82 +++++++++++++------
.../src/main/resources/application.yml | 1 +
.../auth/DataHubAuthorizerFactory.java | 5 +-
.../linkedin/entity/client/EntityClient.java | 2 +-
.../entity/client/RestliEntityClient.java | 7 +-
.../cypress/e2e/settings/managing_groups.js | 2 +-
11 files changed, 153 insertions(+), 75 deletions(-)
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/ListPoliciesResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/ListPoliciesResolver.java
index 516d6fa2d31372..b44da1c2f832c6 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/ListPoliciesResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/ListPoliciesResolver.java
@@ -40,23 +40,15 @@ public CompletableFuture get(final DataFetchingEnvironment e
final Integer count = input.getCount() == null ? DEFAULT_COUNT : input.getCount();
final String query = input.getQuery() == null ? DEFAULT_QUERY : input.getQuery();
- return CompletableFuture.supplyAsync(() -> {
- try {
- // First, get all policy Urns.
- final PolicyFetcher.PolicyFetchResult policyFetchResult =
- _policyFetcher.fetchPolicies(start, count, query, context.getAuthentication());
-
- // Now that we have entities we can bind this to a result.
- final ListPoliciesResult result = new ListPoliciesResult();
- result.setStart(start);
- result.setCount(count);
- result.setTotal(policyFetchResult.getTotal());
- result.setPolicies(mapEntities(policyFetchResult.getPolicies()));
- return result;
- } catch (Exception e) {
- throw new RuntimeException("Failed to list policies", e);
- }
- });
+ return _policyFetcher.fetchPolicies(start, query, count, context.getAuthentication())
+ .thenApply(policyFetchResult -> {
+ final ListPoliciesResult result = new ListPoliciesResult();
+ result.setStart(start);
+ result.setCount(count);
+ result.setTotal(policyFetchResult.getTotal());
+ result.setPolicies(mapEntities(policyFetchResult.getPolicies()));
+ return result;
+ });
}
throw new AuthorizationException("Unauthorized to perform this action. Please contact your DataHub administrator.");
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
index a69c6008fea474..dff9a22de8efd8 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java
@@ -381,7 +381,7 @@ public SearchResult searchAcrossEntities(
@Nonnull
@Override
public ScrollResult scrollAcrossEntities(@Nonnull List entities, @Nonnull String input,
- @Nullable Filter filter, @Nullable String scrollId, @Nonnull String keepAlive, int count,
+ @Nullable Filter filter, @Nullable String scrollId, @Nullable String keepAlive, int count,
@Nullable SearchFlags searchFlags, @Nonnull Authentication authentication)
throws RemoteInvocationException {
final SearchFlags finalFlags = searchFlags != null ? searchFlags : new SearchFlags().setFulltext(true);
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java
index 94b8d57efcc160..c99e4a94feb291 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java
@@ -147,15 +147,23 @@ public SearchResult searchAcrossEntities(@Nonnull List entities, @Nonnul
return result;
}
+ /**
+ * If no entities are provided, fall back to the list of non-empty entities
+ * @param inputEntities the requested entities
+ * @return some entities to search
+ */
private List getEntitiesToSearch(@Nonnull List inputEntities) {
List nonEmptyEntities;
List lowercaseEntities = inputEntities.stream().map(String::toLowerCase).collect(Collectors.toList());
- try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getNonEmptyEntities").time()) {
- nonEmptyEntities = _entityDocCountCache.getNonEmptyEntities();
- }
- if (!inputEntities.isEmpty()) {
- nonEmptyEntities = nonEmptyEntities.stream().filter(lowercaseEntities::contains).collect(Collectors.toList());
+
+ if (lowercaseEntities.isEmpty()) {
+ try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getNonEmptyEntities").time()) {
+ nonEmptyEntities = _entityDocCountCache.getNonEmptyEntities();
+ }
+ } else {
+ nonEmptyEntities = lowercaseEntities;
}
+
return nonEmptyEntities;
}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
index f8b28f6c182a72..f8f99475de23e2 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
@@ -72,11 +72,13 @@ public DataHubAuthorizer(
final EntityClient entityClient,
final int delayIntervalSeconds,
final int refreshIntervalSeconds,
- final AuthorizationMode mode) {
+ final AuthorizationMode mode,
+ final int policyFetchSize) {
_systemAuthentication = Objects.requireNonNull(systemAuthentication);
_mode = Objects.requireNonNull(mode);
_policyEngine = new PolicyEngine(systemAuthentication, Objects.requireNonNull(entityClient));
- _policyRefreshRunnable = new PolicyRefreshRunnable(systemAuthentication, new PolicyFetcher(entityClient), _policyCache, readWriteLock.writeLock());
+ _policyRefreshRunnable = new PolicyRefreshRunnable(systemAuthentication, new PolicyFetcher(entityClient), _policyCache,
+ readWriteLock.writeLock(), policyFetchSize);
_refreshExecutorService.scheduleAtFixedRate(_policyRefreshRunnable, delayIntervalSeconds, refreshIntervalSeconds, TimeUnit.SECONDS);
}
@@ -244,29 +246,28 @@ static class PolicyRefreshRunnable implements Runnable {
private final PolicyFetcher _policyFetcher;
private final Map> _policyCache;
private final Lock writeLock;
+ private final int count;
@Override
public void run() {
try {
// Populate new cache and swap.
Map> newCache = new HashMap<>();
+ Integer total = null;
+ String scrollId = null;
- int start = 0;
- int count = 30;
- int total = 30;
-
- while (start < total) {
+ while (total == null || scrollId != null) {
try {
final PolicyFetcher.PolicyFetchResult
- policyFetchResult = _policyFetcher.fetchPolicies(start, count, _systemAuthentication);
+ policyFetchResult = _policyFetcher.fetchPolicies(count, scrollId, _systemAuthentication);
addPoliciesToCache(newCache, policyFetchResult.getPolicies());
total = policyFetchResult.getTotal();
- start = start + count;
+ scrollId = policyFetchResult.getScrollId();
} catch (Exception e) {
log.error(
- "Failed to retrieve policy urns! Skipping updating policy cache until next refresh. start: {}, count: {}", start, count, e);
+ "Failed to retrieve policy urns! Skipping updating policy cache until next refresh. count: {}, scrollId: {}", count, scrollId, e);
return;
}
}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java
index 92d12bad41c9f5..c06da4d245f917 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java
@@ -8,8 +8,8 @@
import com.linkedin.metadata.query.SearchFlags;
import com.linkedin.metadata.query.filter.SortCriterion;
import com.linkedin.metadata.query.filter.SortOrder;
+import com.linkedin.metadata.search.ScrollResult;
import com.linkedin.metadata.search.SearchEntity;
-import com.linkedin.metadata.search.SearchResult;
import com.linkedin.policy.DataHubPolicyInfo;
import com.linkedin.r2.RemoteInvocationException;
import java.net.URISyntaxException;
@@ -18,11 +18,14 @@
import java.util.List;
import java.util.Map;
import java.util.Objects;
+import java.util.concurrent.CompletableFuture;
import java.util.stream.Collectors;
import lombok.RequiredArgsConstructor;
import lombok.Value;
import lombok.extern.slf4j.Slf4j;
+import javax.annotation.Nullable;
+
import static com.linkedin.metadata.Constants.DATAHUB_POLICY_INFO_ASPECT_NAME;
import static com.linkedin.metadata.Constants.POLICY_ENTITY_NAME;
@@ -38,22 +41,53 @@ public class PolicyFetcher {
private static final SortCriterion POLICY_SORT_CRITERION =
new SortCriterion().setField("lastUpdatedTimestamp").setOrder(SortOrder.DESCENDING);
- public PolicyFetchResult fetchPolicies(int start, int count, Authentication authentication)
- throws RemoteInvocationException, URISyntaxException {
- return fetchPolicies(start, count, "", authentication);
+ /**
+ * This provides a scroll implementation on top of the start/count API. It is not efficient,
+ * and the native scroll functions should be used instead. It does, however, fix a failure to
+ * fetch policies when deep pagination occurs (i.e. there are more than 10k policies).
+ * It exists primarily to avoid a breaking change to the GraphQL API.
+ */
+ @Deprecated
+ public CompletableFuture fetchPolicies(int start, String query, int count, Authentication authentication) {
+ return CompletableFuture.supplyAsync(() -> {
+ try {
+ PolicyFetchResult result = PolicyFetchResult.EMPTY;
+ String scrollId = "";
+ int fetchedResults = 0;
+
+ while (PolicyFetchResult.EMPTY.equals(result) && scrollId != null) {
+ PolicyFetchResult tmpResult = fetchPolicies(query, count, scrollId.isEmpty() ? null : scrollId, authentication);
+ fetchedResults += tmpResult.getPolicies().size();
+ scrollId = tmpResult.getScrollId();
+ if (fetchedResults > start) {
+ result = tmpResult;
+ }
+ }
+
+ return result;
+ } catch (Exception e) {
+ throw new RuntimeException("Failed to list policies", e);
+ }
+ });
}
- public PolicyFetchResult fetchPolicies(int start, int count, String query, Authentication authentication)
+ public PolicyFetchResult fetchPolicies(int count, @Nullable String scrollId, Authentication authentication)
+ throws RemoteInvocationException, URISyntaxException {
+ return fetchPolicies("", count, scrollId, authentication);
+ }
+
+ public PolicyFetchResult fetchPolicies(String query, int count, @Nullable String scrollId, Authentication authentication)
throws RemoteInvocationException, URISyntaxException {
- log.debug(String.format("Batch fetching policies. start: %s, count: %s ", start, count));
- // First fetch all policy urns from start - start + count
- SearchResult result =
- _entityClient.search(POLICY_ENTITY_NAME, query, null, POLICY_SORT_CRITERION, start, count, authentication,
- new SearchFlags().setFulltext(true));
+ log.debug(String.format("Batch fetching policies. count: %s, scroll: %s", count, scrollId));
+
+ // First fetch all policy urns
+ ScrollResult result = _entityClient.scrollAcrossEntities(List.of(POLICY_ENTITY_NAME), query, null, scrollId,
+ null, count, new SearchFlags().setSkipCache(true).setSkipAggregates(true)
+ .setSkipHighlighting(true).setFulltext(true), authentication);
List policyUrns = result.getEntities().stream().map(SearchEntity::getEntity).collect(Collectors.toList());
if (policyUrns.isEmpty()) {
- return new PolicyFetchResult(Collections.emptyList(), 0);
+ return PolicyFetchResult.EMPTY;
}
// Fetch DataHubPolicyInfo aspects for each urn
@@ -64,7 +98,7 @@ public PolicyFetchResult fetchPolicies(int start, int count, String query, Authe
.filter(Objects::nonNull)
.map(this::extractPolicy)
.filter(Objects::nonNull)
- .collect(Collectors.toList()), result.getNumEntities());
+ .collect(Collectors.toList()), result.getNumEntities(), result.getScrollId());
}
private Policy extractPolicy(EntityResponse entityResponse) {
@@ -82,6 +116,10 @@ private Policy extractPolicy(EntityResponse entityResponse) {
public static class PolicyFetchResult {
List policies;
int total;
+ @Nullable
+ String scrollId;
+
+ public static final PolicyFetchResult EMPTY = new PolicyFetchResult(Collections.emptyList(), 0, null);
}
@Value
diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java
index 24ecfa6fefc856..babb1c5d00ee8a 100644
--- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java
+++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java
@@ -22,6 +22,7 @@
import com.linkedin.entity.EnvelopedAspectMap;
import com.linkedin.entity.client.EntityClient;
import com.linkedin.metadata.query.SearchFlags;
+import com.linkedin.metadata.search.ScrollResult;
import com.linkedin.metadata.search.SearchEntity;
import com.linkedin.metadata.search.SearchEntityArray;
import com.linkedin.metadata.search.SearchResult;
@@ -35,6 +36,8 @@
import java.util.List;
import java.util.Map;
import java.util.Optional;
+import java.util.Set;
+
import org.testng.annotations.BeforeMethod;
import org.testng.annotations.Test;
@@ -89,30 +92,58 @@ public void setupTest() throws Exception {
final EnvelopedAspectMap childDomainPolicyAspectMap = new EnvelopedAspectMap();
childDomainPolicyAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(childDomainPolicy.data())));
- final SearchResult policySearchResult = new SearchResult();
- policySearchResult.setNumEntities(3);
- policySearchResult.setEntities(
- new SearchEntityArray(
- ImmutableList.of(
- new SearchEntity().setEntity(activePolicyUrn),
- new SearchEntity().setEntity(inactivePolicyUrn),
- new SearchEntity().setEntity(parentDomainPolicyUrn),
- new SearchEntity().setEntity(childDomainPolicyUrn)
- )
- )
- );
-
- when(_entityClient.search(eq("dataHubPolicy"), eq(""), isNull(), any(), anyInt(), anyInt(), any(),
- eq(new SearchFlags().setFulltext(true)))).thenReturn(policySearchResult);
- when(_entityClient.batchGetV2(eq(POLICY_ENTITY_NAME),
- eq(ImmutableSet.of(activePolicyUrn, inactivePolicyUrn, parentDomainPolicyUrn, childDomainPolicyUrn)), eq(null), any())).thenReturn(
- ImmutableMap.of(
- activePolicyUrn, new EntityResponse().setUrn(activePolicyUrn).setAspects(activeAspectMap),
- inactivePolicyUrn, new EntityResponse().setUrn(inactivePolicyUrn).setAspects(inactiveAspectMap),
- parentDomainPolicyUrn, new EntityResponse().setUrn(parentDomainPolicyUrn).setAspects(parentDomainPolicyAspectMap),
- childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspects(childDomainPolicyAspectMap)
- )
- );
+ final ScrollResult policySearchResult1 = new ScrollResult()
+ .setScrollId("1")
+ .setNumEntities(4)
+ .setEntities(
+ new SearchEntityArray(
+ ImmutableList.of(new SearchEntity().setEntity(activePolicyUrn))));
+
+ final ScrollResult policySearchResult2 = new ScrollResult()
+ .setScrollId("2")
+ .setNumEntities(4)
+ .setEntities(
+ new SearchEntityArray(
+ ImmutableList.of(new SearchEntity().setEntity(inactivePolicyUrn))));
+
+ final ScrollResult policySearchResult3 = new ScrollResult()
+ .setScrollId("3")
+ .setNumEntities(4)
+ .setEntities(
+ new SearchEntityArray(
+ ImmutableList.of(new SearchEntity().setEntity(parentDomainPolicyUrn))));
+
+ final ScrollResult policySearchResult4 = new ScrollResult()
+ .setNumEntities(4)
+ .setEntities(
+ new SearchEntityArray(
+ ImmutableList.of(
+ new SearchEntity().setEntity(childDomainPolicyUrn))));
+
+ when(_entityClient.scrollAcrossEntities(eq(List.of("dataHubPolicy")), eq(""), isNull(), any(), isNull(),
+ anyInt(), eq(new SearchFlags().setFulltext(true).setSkipAggregates(true).setSkipHighlighting(true).setSkipCache(true)), any()))
+ .thenReturn(policySearchResult1)
+ .thenReturn(policySearchResult2)
+ .thenReturn(policySearchResult3)
+ .thenReturn(policySearchResult4);
+
+ when(_entityClient.batchGetV2(eq(POLICY_ENTITY_NAME), any(), eq(null), any())).thenAnswer(args -> {
+ Set inputUrns = args.getArgument(1);
+ Urn urn = inputUrns.stream().findFirst().get();
+
+ switch (urn.toString()) {
+ case "urn:li:dataHubPolicy:0":
+ return Map.of(activePolicyUrn, new EntityResponse().setUrn(activePolicyUrn).setAspects(activeAspectMap));
+ case "urn:li:dataHubPolicy:1":
+ return Map.of(inactivePolicyUrn, new EntityResponse().setUrn(inactivePolicyUrn).setAspects(inactiveAspectMap));
+ case "urn:li:dataHubPolicy:2":
+ return Map.of(parentDomainPolicyUrn, new EntityResponse().setUrn(parentDomainPolicyUrn).setAspects(parentDomainPolicyAspectMap));
+ case "urn:li:dataHubPolicy:3":
+ return Map.of(childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspects(childDomainPolicyAspectMap));
+ default:
+ throw new IllegalStateException();
+ }
+ });
final List userUrns = ImmutableList.of(Urn.createFromString("urn:li:corpuser:user3"), Urn.createFromString("urn:li:corpuser:user4"));
final List groupUrns = ImmutableList.of(Urn.createFromString("urn:li:corpGroup:group3"), Urn.createFromString("urn:li:corpGroup:group4"));
@@ -146,7 +177,8 @@ childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspec
_entityClient,
10,
10,
- DataHubAuthorizer.AuthorizationMode.DEFAULT
+ DataHubAuthorizer.AuthorizationMode.DEFAULT,
+ 1 // force pagination logic
);
_dataHubAuthorizer.init(Collections.emptyMap(), createAuthorizerContext(systemAuthentication, _entityClient));
_dataHubAuthorizer.invalidateCache();
diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml
index 91b10a75c922e2..e9113d339e81d7 100644
--- a/metadata-service/configuration/src/main/resources/application.yml
+++ b/metadata-service/configuration/src/main/resources/application.yml
@@ -39,6 +39,7 @@ authorization:
defaultAuthorizer:
enabled: ${AUTH_POLICIES_ENABLED:true}
cacheRefreshIntervalSecs: ${POLICY_CACHE_REFRESH_INTERVAL_SECONDS:120}
+ cachePolicyFetchSize: ${POLICY_CACHE_FETCH_SIZE:1000}
# Enables authorization of reads, writes, and deletes on REST APIs. Defaults to false for backwards compatibility, but should become true down the road
restApiAuthorization: ${REST_API_AUTHORIZATION_ENABLED:false}
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java
index 5b298a453547a7..663234e2519faf 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java
@@ -32,6 +32,9 @@ public class DataHubAuthorizerFactory {
@Value("${authorization.defaultAuthorizer.cacheRefreshIntervalSecs}")
private Integer policyCacheRefreshIntervalSeconds;
+ @Value("${authorization.defaultAuthorizer.cachePolicyFetchSize}")
+ private Integer policyCacheFetchSize;
+
@Value("${authorization.defaultAuthorizer.enabled:true}")
private Boolean policiesEnabled;
@@ -44,6 +47,6 @@ protected DataHubAuthorizer getInstance() {
: DataHubAuthorizer.AuthorizationMode.ALLOW_ALL;
return new DataHubAuthorizer(systemAuthentication, entityClient, 10,
- policyCacheRefreshIntervalSeconds, mode);
+ policyCacheRefreshIntervalSeconds, mode, policyCacheFetchSize);
}
}
diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java
index b9661ec75e1b1f..84d0ed6b9594df 100644
--- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java
+++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java
@@ -241,7 +241,7 @@ public SearchResult searchAcrossEntities(@Nonnull List entities, @Nonnul
*/
@Nonnull
ScrollResult scrollAcrossEntities(@Nonnull List entities, @Nonnull String input,
- @Nullable Filter filter, @Nullable String scrollId, @Nonnull String keepAlive, int count, @Nullable SearchFlags searchFlags,
+ @Nullable Filter filter, @Nullable String scrollId, @Nullable String keepAlive, int count, @Nullable SearchFlags searchFlags,
@Nonnull Authentication authentication)
throws RemoteInvocationException;
diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
index 47a00e711a9350..2716e27518fcc5 100644
--- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
+++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java
@@ -482,11 +482,11 @@ public SearchResult searchAcrossEntities(@Nonnull List entities, @Nonnul
@Nonnull
@Override
public ScrollResult scrollAcrossEntities(@Nonnull List entities, @Nonnull String input,
- @Nullable Filter filter, @Nullable String scrollId, @Nonnull String keepAlive, int count,
+ @Nullable Filter filter, @Nullable String scrollId, @Nullable String keepAlive, int count,
@Nullable SearchFlags searchFlags, @Nonnull Authentication authentication)
throws RemoteInvocationException {
final EntitiesDoScrollAcrossEntitiesRequestBuilder requestBuilder =
- ENTITIES_REQUEST_BUILDERS.actionScrollAcrossEntities().inputParam(input).countParam(count).keepAliveParam(keepAlive);
+ ENTITIES_REQUEST_BUILDERS.actionScrollAcrossEntities().inputParam(input).countParam(count);
if (entities != null) {
requestBuilder.entitiesParam(new StringArray(entities));
@@ -500,6 +500,9 @@ public ScrollResult scrollAcrossEntities(@Nonnull List entities, @Nonnul
if (searchFlags != null) {
requestBuilder.searchFlagsParam(searchFlags);
}
+ if (keepAlive != null) {
+ requestBuilder.keepAliveParam(keepAlive);
+ }
return sendClientRequest(requestBuilder, authentication).getEntity();
}
diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js
index 9559435ff01c85..8d689c7e2303c4 100644
--- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js
+++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js
@@ -81,7 +81,7 @@ describe("create and manage group", () => {
cy.focused().type(expected_name);
cy.get(".ant-select-item-option").contains(expected_name, { matchCase: false }).click();
cy.focused().blur();
- cy.contains(expected_name).should("have.length", 1);
+ cy.contains(expected_name, { matchCase: false }).should("have.length", 1);
cy.get('[role="dialog"] button').contains("Done").click();
cy.waitTextVisible("Owners Added");
cy.contains(expected_name, { matchCase: false }).should("be.visible");
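Both the policy-cache refresh loop and the deprecated start/count shim above follow the same cursor pattern: keep requesting pages with the returned scroll id until the service stops returning one. Below is a minimal Python sketch of that pattern under assumed names (fetch_page, Page, and its total/scroll_id fields stand in for EntityClient.scrollAcrossEntities and ScrollResult); it is not the Java implementation in this patch.

    from typing import Callable, Iterable, List, NamedTuple, Optional


    class Page(NamedTuple):
        entities: List[str]
        total: int
        scroll_id: Optional[str]  # None once the last page has been served


    def scroll_all(fetch_page: Callable[[Optional[str], int], Page], count: int) -> Iterable[str]:
        """Yield every entity by following scroll ids until the server stops returning one."""
        total: Optional[int] = None
        scroll_id: Optional[str] = None
        while total is None or scroll_id is not None:
            page = fetch_page(scroll_id, count)
            yield from page.entities
            total = page.total
            scroll_id = page.scroll_id


    # Usage: page through five fake policy urns, two at a time.
    POLICIES = [f"urn:li:dataHubPolicy:{i}" for i in range(5)]


    def fake_fetch(scroll_id: Optional[str], count: int) -> Page:
        start = int(scroll_id or 0)
        next_id = str(start + count) if start + count < len(POLICIES) else None
        return Page(POLICIES[start : start + count], len(POLICIES), next_id)


    assert list(scroll_all(fake_fetch, 2)) == POLICIES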
From 353584c10acbee7554c2eb255512173f24e86785 Mon Sep 17 00:00:00 2001
From: david-leifker <114954101+david-leifker@users.noreply.github.com>
Date: Tue, 7 Nov 2023 18:22:18 -0600
Subject: [PATCH 23/33] feat(browsepathv2): Allow system-update to reprocess
browse paths v2 (#9200)
---
.../steps/BackfillBrowsePathsV2Step.java | 86 ++++++++++++++-----
.../env/docker-without-neo4j.env | 1 +
docker/datahub-upgrade/env/docker.env | 1 +
docker/docker-compose.dev.yml | 4 +
.../docker-compose-m1.quickstart.yml | 1 +
...er-compose-without-neo4j-m1.quickstart.yml | 1 +
...ocker-compose-without-neo4j.quickstart.yml | 1 +
.../quickstart/docker-compose.quickstart.yml | 1 +
.../client/CachingEntitySearchService.java | 16 ++--
.../elasticsearch/query/ESSearchDAO.java | 4 +-
.../query/request/SearchRequestHandler.java | 8 +-
.../src/main/resources/application.yml | 2 +
.../metadata/search/EntitySearchService.java | 4 +-
13 files changed, 94 insertions(+), 36 deletions(-)
diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/entity/steps/BackfillBrowsePathsV2Step.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/entity/steps/BackfillBrowsePathsV2Step.java
index 7547186ccfb230..08a752d9597f42 100644
--- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/entity/steps/BackfillBrowsePathsV2Step.java
+++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/entity/steps/BackfillBrowsePathsV2Step.java
@@ -6,6 +6,7 @@
import com.linkedin.common.BrowsePathsV2;
import com.linkedin.common.urn.Urn;
import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.data.template.StringArray;
import com.linkedin.datahub.upgrade.UpgradeContext;
import com.linkedin.datahub.upgrade.UpgradeStep;
import com.linkedin.datahub.upgrade.UpgradeStepResult;
@@ -13,6 +14,7 @@
import com.linkedin.events.metadata.ChangeType;
import com.linkedin.metadata.Constants;
import com.linkedin.metadata.entity.EntityService;
+import com.linkedin.metadata.query.SearchFlags;
import com.linkedin.metadata.query.filter.Condition;
import com.linkedin.metadata.query.filter.ConjunctiveCriterion;
import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray;
@@ -37,6 +39,8 @@
public class BackfillBrowsePathsV2Step implements UpgradeStep {
public static final String BACKFILL_BROWSE_PATHS_V2 = "BACKFILL_BROWSE_PATHS_V2";
+ public static final String REPROCESS_DEFAULT_BROWSE_PATHS_V2 = "REPROCESS_DEFAULT_BROWSE_PATHS_V2";
+ public static final String DEFAULT_BROWSE_PATH_V2 = "␟Default";
private static final Set<String> ENTITY_TYPES_TO_MIGRATE = ImmutableSet.of(
Constants.DATASET_ENTITY_NAME,
@@ -81,27 +85,14 @@ public Function executable() {
private String backfillBrowsePathsV2(String entityType, AuditStamp auditStamp, String scrollId) {
- // Condition: has `browsePaths` AND does NOT have `browsePathV2`
- Criterion missingBrowsePathV2 = new Criterion();
- missingBrowsePathV2.setCondition(Condition.IS_NULL);
- missingBrowsePathV2.setField("browsePathV2");
- // Excludes entities without browsePaths
- Criterion hasBrowsePathV1 = new Criterion();
- hasBrowsePathV1.setCondition(Condition.EXISTS);
- hasBrowsePathV1.setField("browsePaths");
-
- CriterionArray criterionArray = new CriterionArray();
- criterionArray.add(missingBrowsePathV2);
- criterionArray.add(hasBrowsePathV1);
-
- ConjunctiveCriterion conjunctiveCriterion = new ConjunctiveCriterion();
- conjunctiveCriterion.setAnd(criterionArray);
+ final Filter filter;
- ConjunctiveCriterionArray conjunctiveCriterionArray = new ConjunctiveCriterionArray();
- conjunctiveCriterionArray.add(conjunctiveCriterion);
-
- Filter filter = new Filter();
- filter.setOr(conjunctiveCriterionArray);
+ if (System.getenv().containsKey(REPROCESS_DEFAULT_BROWSE_PATHS_V2)
+ && Boolean.parseBoolean(System.getenv(REPROCESS_DEFAULT_BROWSE_PATHS_V2))) {
+ filter = backfillDefaultBrowsePathsV2Filter();
+ } else {
+ filter = backfillBrowsePathsV2Filter();
+ }
final ScrollResult scrollResult = _searchService.scrollAcrossEntities(
ImmutableList.of(entityType),
@@ -109,9 +100,9 @@ private String backfillBrowsePathsV2(String entityType, AuditStamp auditStamp, S
filter,
null,
scrollId,
- "5m",
+ null,
BATCH_SIZE,
- null
+ new SearchFlags().setFulltext(true).setSkipCache(true).setSkipHighlighting(true).setSkipAggregates(true)
);
if (scrollResult.getNumEntities() == 0 || scrollResult.getEntities().size() == 0) {
return null;
@@ -129,6 +120,55 @@ private String backfillBrowsePathsV2(String entityType, AuditStamp auditStamp, S
return scrollResult.getScrollId();
}
+ private Filter backfillBrowsePathsV2Filter() {
+ // Condition: has `browsePaths` AND does NOT have `browsePathV2`
+ Criterion missingBrowsePathV2 = new Criterion();
+ missingBrowsePathV2.setCondition(Condition.IS_NULL);
+ missingBrowsePathV2.setField("browsePathV2");
+ // Excludes entities without browsePaths
+ Criterion hasBrowsePathV1 = new Criterion();
+ hasBrowsePathV1.setCondition(Condition.EXISTS);
+ hasBrowsePathV1.setField("browsePaths");
+
+ CriterionArray criterionArray = new CriterionArray();
+ criterionArray.add(missingBrowsePathV2);
+ criterionArray.add(hasBrowsePathV1);
+
+ ConjunctiveCriterion conjunctiveCriterion = new ConjunctiveCriterion();
+ conjunctiveCriterion.setAnd(criterionArray);
+
+ ConjunctiveCriterionArray conjunctiveCriterionArray = new ConjunctiveCriterionArray();
+ conjunctiveCriterionArray.add(conjunctiveCriterion);
+
+ Filter filter = new Filter();
+ filter.setOr(conjunctiveCriterionArray);
+ return filter;
+ }
+
+ private Filter backfillDefaultBrowsePathsV2Filter() {
+ // Condition: has default `browsePathV2`
+ Criterion hasDefaultBrowsePathV2 = new Criterion();
+ hasDefaultBrowsePathV2.setCondition(Condition.EQUAL);
+ hasDefaultBrowsePathV2.setField("browsePathV2");
+ StringArray values = new StringArray();
+ values.add(DEFAULT_BROWSE_PATH_V2);
+ hasDefaultBrowsePathV2.setValues(values);
+ hasDefaultBrowsePathV2.setValue(DEFAULT_BROWSE_PATH_V2); // not used, but required field?
+
+ CriterionArray criterionArray = new CriterionArray();
+ criterionArray.add(hasDefaultBrowsePathV2);
+
+ ConjunctiveCriterion conjunctiveCriterion = new ConjunctiveCriterion();
+ conjunctiveCriterion.setAnd(criterionArray);
+
+ ConjunctiveCriterionArray conjunctiveCriterionArray = new ConjunctiveCriterionArray();
+ conjunctiveCriterionArray.add(conjunctiveCriterion);
+
+ Filter filter = new Filter();
+ filter.setOr(conjunctiveCriterionArray);
+ return filter;
+ }
+
private void ingestBrowsePathsV2(Urn urn, AuditStamp auditStamp) throws Exception {
BrowsePathsV2 browsePathsV2 = _entityService.buildDefaultBrowsePathV2(urn, true);
log.debug(String.format("Adding browse path v2 for urn %s with value %s", urn, browsePathsV2));
@@ -142,7 +182,7 @@ private void ingestBrowsePathsV2(Urn urn, AuditStamp auditStamp) throws Exceptio
_entityService.ingestProposal(
proposal,
auditStamp,
- false
+ true
);
}
diff --git a/docker/datahub-upgrade/env/docker-without-neo4j.env b/docker/datahub-upgrade/env/docker-without-neo4j.env
index c399f71b7b15c8..04d888f076cd68 100644
--- a/docker/datahub-upgrade/env/docker-without-neo4j.env
+++ b/docker/datahub-upgrade/env/docker-without-neo4j.env
@@ -21,6 +21,7 @@ DATAHUB_GMS_PORT=8080
ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
BACKFILL_BROWSE_PATHS_V2=true
+REPROCESS_DEFAULT_BROWSE_PATHS_V2=${REPROCESS_DEFAULT_BROWSE_PATHS_V2:-false}
# Uncomment and set these to support SSL connection to Elasticsearch
# ELASTICSEARCH_USE_SSL=
diff --git a/docker/datahub-upgrade/env/docker.env b/docker/datahub-upgrade/env/docker.env
index 491470406153b2..b2a0d01e5d4ae8 100644
--- a/docker/datahub-upgrade/env/docker.env
+++ b/docker/datahub-upgrade/env/docker.env
@@ -25,6 +25,7 @@ DATAHUB_GMS_PORT=8080
ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
BACKFILL_BROWSE_PATHS_V2=true
+REPROCESS_DEFAULT_BROWSE_PATHS_V2=${REPROCESS_DEFAULT_BROWSE_PATHS_V2:-false}
# Uncomment and set these to support SSL connection to Elasticsearch
# ELASTICSEARCH_USE_SSL=
diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml
index c4e5ee7fa0cae9..774c4e17bee21f 100644
--- a/docker/docker-compose.dev.yml
+++ b/docker/docker-compose.dev.yml
@@ -54,6 +54,8 @@ services:
- ${HOME}/.datahub/plugins:/etc/datahub/plugins
datahub-upgrade:
image: acryldata/datahub-upgrade:debug
+ ports:
+ - ${DATAHUB_MAPPED_UPGRADE_DEBUG_PORT:-5003}:5003
build:
context: datahub-upgrade
dockerfile: Dockerfile
@@ -63,6 +65,8 @@ services:
- SKIP_ELASTICSEARCH_CHECK=false
- DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-dev}
- DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true}
+ - REPROCESS_DEFAULT_BROWSE_PATHS_V2=${REPROCESS_DEFAULT_BROWSE_PATHS_V2:-false}
+ - JAVA_TOOL_OPTIONS=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5003
volumes:
- ../datahub-upgrade/build/libs/:/datahub/datahub-upgrade/bin/
- ../metadata-models/src/main/resources/:/datahub/datahub-gms/resources
diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml
index 3b6d02c83d0f07..c96baf37551b29 100644
--- a/docker/quickstart/docker-compose-m1.quickstart.yml
+++ b/docker/quickstart/docker-compose-m1.quickstart.yml
@@ -151,6 +151,7 @@ services:
- DATAHUB_GMS_PORT=8080
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- BACKFILL_BROWSE_PATHS_V2=true
+ - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false
hostname: datahub-upgrade
image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head}
labels:
diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
index e45bafc3da480e..b1cb6c208a42d6 100644
--- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
+++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml
@@ -144,6 +144,7 @@ services:
- DATAHUB_GMS_PORT=8080
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- BACKFILL_BROWSE_PATHS_V2=true
+ - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false
hostname: datahub-upgrade
image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head}
labels:
diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml
index 020ef5e9a97b96..ab5182bf98ae50 100644
--- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml
+++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml
@@ -144,6 +144,7 @@ services:
- DATAHUB_GMS_PORT=8080
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- BACKFILL_BROWSE_PATHS_V2=true
+ - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false
hostname: datahub-upgrade
image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head}
labels:
diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml
index 8adc2b9063b840..8a66521cbb5221 100644
--- a/docker/quickstart/docker-compose.quickstart.yml
+++ b/docker/quickstart/docker-compose.quickstart.yml
@@ -151,6 +151,7 @@ services:
- DATAHUB_GMS_PORT=8080
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- BACKFILL_BROWSE_PATHS_V2=true
+ - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false
hostname: datahub-upgrade
image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head}
labels:
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java
index ceaf37a1289d99..db414d70603dc7 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java
@@ -16,7 +16,7 @@
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import lombok.RequiredArgsConstructor;
-import org.javatuples.Quintet;
+import org.javatuples.Septet;
import org.javatuples.Sextet;
import org.springframework.cache.Cache;
import org.springframework.cache.CacheManager;
@@ -154,8 +154,9 @@ public SearchResult getCachedSearchResults(
batchSize,
querySize -> getRawSearchResults(entityNames, query, filters, sortCriterion, querySize.getFrom(),
querySize.getSize(), flags, facets),
- querySize -> Sextet.with(entityNames, query, filters != null ? toJsonString(filters) : null,
- sortCriterion != null ? toJsonString(sortCriterion) : null, facets, querySize), flags, enableCache).getSearchResults(from, size);
+ querySize -> Septet.with(entityNames, query, filters != null ? toJsonString(filters) : null,
+ sortCriterion != null ? toJsonString(sortCriterion) : null, flags != null ? toJsonString(flags) : null,
+ facets, querySize), flags, enableCache).getSearchResults(from, size);
}
@@ -175,7 +176,8 @@ public AutoCompleteResult getCachedAutoCompleteResults(
if (enableCache(flags)) {
try (Timer.Context ignored2 = MetricUtils.timer(this.getClass(), "getCachedAutoCompleteResults_cache").time()) {
Timer.Context cacheAccess = MetricUtils.timer(this.getClass(), "autocomplete_cache_access").time();
- Object cacheKey = Quintet.with(entityName, input, field, filters != null ? toJsonString(filters) : null, limit);
+ Object cacheKey = Sextet.with(entityName, input, field, filters != null ? toJsonString(filters) : null,
+ flags != null ? toJsonString(flags) : null, limit);
String json = cache.get(cacheKey, String.class);
result = json != null ? toRecordTemplate(AutoCompleteResult.class, json) : null;
cacheAccess.stop();
@@ -210,7 +212,8 @@ public BrowseResult getCachedBrowseResults(
if (enableCache(flags)) {
try (Timer.Context ignored2 = MetricUtils.timer(this.getClass(), "getCachedBrowseResults_cache").time()) {
Timer.Context cacheAccess = MetricUtils.timer(this.getClass(), "browse_cache_access").time();
- Object cacheKey = Quintet.with(entityName, path, filters != null ? toJsonString(filters) : null, from, size);
+ Object cacheKey = Sextet.with(entityName, path, filters != null ? toJsonString(filters) : null,
+ flags != null ? toJsonString(flags) : null, from, size);
String json = cache.get(cacheKey, String.class);
result = json != null ? toRecordTemplate(BrowseResult.class, json) : null;
cacheAccess.stop();
@@ -247,9 +250,10 @@ public ScrollResult getCachedScrollResults(
ScrollResult result;
if (enableCache(flags)) {
Timer.Context cacheAccess = MetricUtils.timer(this.getClass(), "scroll_cache_access").time();
- Object cacheKey = Sextet.with(entities, query,
+ Object cacheKey = Septet.with(entities, query,
filters != null ? toJsonString(filters) : null,
sortCriterion != null ? toJsonString(sortCriterion) : null,
+ flags != null ? toJsonString(flags) : null,
scrollId, size);
String json = cache.get(cacheKey, String.class);
result = json != null ? toRecordTemplate(ScrollResult.class, json) : null;
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java
index cbaf70ca22617d..290e8c60deb000 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java
@@ -157,7 +157,7 @@ private AggregationMetadataArray transformIndexIntoEntityName(AggregationMetadat
@Nonnull
@WithSpan
private ScrollResult executeAndExtract(@Nonnull List<EntitySpec> entitySpecs, @Nonnull SearchRequest searchRequest, @Nullable Filter filter,
- @Nullable String scrollId, @Nonnull String keepAlive, int size) {
+ @Nullable String scrollId, @Nullable String keepAlive, int size) {
try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "executeAndExtract_scroll").time()) {
final SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT);
// extract results, validated against document model as well
@@ -166,7 +166,7 @@ private ScrollResult executeAndExtract(@Nonnull List entitySpecs, @N
.extractScrollResult(searchResponse,
filter, scrollId, keepAlive, size, supportsPointInTime()));
} catch (Exception e) {
- log.error("Search query failed", e);
+ log.error("Search query failed: {}", searchRequest, e);
throw new ESQueryException("Search query failed:", e);
}
}
diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java
index 49571a60d5f211..0df6afd49c3735 100644
--- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java
+++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java
@@ -241,7 +241,9 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi
BoolQueryBuilder filterQuery = getFilterQuery(filter);
searchSourceBuilder.query(QueryBuilders.boolQuery().must(getQuery(input, finalSearchFlags.isFulltext())).filter(filterQuery));
- _aggregationQueryBuilder.getAggregations().forEach(searchSourceBuilder::aggregation);
+ if (!finalSearchFlags.isSkipAggregates()) {
+ _aggregationQueryBuilder.getAggregations().forEach(searchSourceBuilder::aggregation);
+ }
if (!finalSearchFlags.isSkipHighlighting()) {
searchSourceBuilder.highlighter(_highlights);
}
@@ -366,7 +368,7 @@ public SearchResult extractResult(@Nonnull SearchResponse searchResponse, Filter
@WithSpan
public ScrollResult extractScrollResult(@Nonnull SearchResponse searchResponse, Filter filter, @Nullable String scrollId,
- @Nonnull String keepAlive, int size, boolean supportsPointInTime) {
+ @Nullable String keepAlive, int size, boolean supportsPointInTime) {
int totalCount = (int) searchResponse.getHits().getTotalHits().value;
List<SearchEntity> resultList = getResults(searchResponse);
SearchResultMetadata searchResultMetadata = extractSearchResultMetadata(searchResponse, filter);
@@ -376,7 +378,7 @@ public ScrollResult extractScrollResult(@Nonnull SearchResponse searchResponse,
if (searchHits.length == size) {
Object[] sort = searchHits[searchHits.length - 1].getSortValues();
long expirationTimeMs = 0L;
- if (supportsPointInTime) {
+ if (keepAlive != null && supportsPointInTime) {
expirationTimeMs = TimeValue.parseTimeValue(keepAlive, "expirationTime").getMillis() + System.currentTimeMillis();
}
nextScrollId = new SearchAfterWrapper(sort, searchResponse.pointInTimeId(), expirationTimeMs).toScrollId();
diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml
index e9113d339e81d7..a06891699607bb 100644
--- a/metadata-service/configuration/src/main/resources/application.yml
+++ b/metadata-service/configuration/src/main/resources/application.yml
@@ -285,6 +285,8 @@ bootstrap:
enabled: ${UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED:false} # enable to run the upgrade to migrate legacy default browse paths to new ones
backfillBrowsePathsV2:
enabled: ${BACKFILL_BROWSE_PATHS_V2:false} # Enables running the backfill of browsePathsV2 upgrade step. There are concerns about the load of this step so hiding it behind a flag. Deprecating in favor of running through SystemUpdate
+ reprocessDefaultBrowsePathsV2:
+ enabled: ${REPROCESS_DEFAULT_BROWSE_PATHS_V2:false} # reprocess V2 browse paths which were set to the default: {"path":[{"id":"Default"}]}
policies:
file: ${BOOTSTRAP_POLICIES_FILE:classpath:boot/policies.json}
# eg for local file
diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java
index 64f59780b887f3..cbfeeaef860d34 100644
--- a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java
+++ b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java
@@ -193,7 +193,7 @@ BrowseResult browse(@Nonnull String entityName, @Nonnull String path, @Nullable
*/
@Nonnull
ScrollResult fullTextScroll(@Nonnull List<String> entities, @Nonnull String input, @Nullable Filter postFilters,
- @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size, @Nullable SearchFlags searchFlags);
+ @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size, @Nullable SearchFlags searchFlags);
/**
* Gets a list of documents that match given search request. The results are aggregated and filters are applied to the
@@ -210,7 +210,7 @@ ScrollResult fullTextScroll(@Nonnull List entities, @Nonnull String inpu
*/
@Nonnull
ScrollResult structuredScroll(@Nonnull List<String> entities, @Nonnull String input, @Nullable Filter postFilters,
- @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size, @Nullable SearchFlags searchFlags);
+ @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size, @Nullable SearchFlags searchFlags);
/**
* Max result size returned by the underlying search backend
From e73e92699947084b5ecb1f5d3e0c5762dc446bbf Mon Sep 17 00:00:00 2001
From: Shubham Jagtap <132359390+shubhamjagtap639@users.noreply.github.com>
Date: Wed, 8 Nov 2023 12:32:41 +0530
Subject: [PATCH 24/33] feat(integration/fivetran): Fivetran connector
integration (#9018)
Co-authored-by: Harshal Sheth
---
.../app/ingest/source/builder/constants.ts | 4 +
.../app/ingest/source/builder/sources.json | 7 +
datahub-web-react/src/images/fivetranlogo.png | Bin 0 -> 10230 bytes
.../docs/sources/fivetran/fivetran_pre.md | 86 +++
.../docs/sources/fivetran/fivetran_recipe.yml | 43 ++
metadata-ingestion/setup.py | 3 +
.../datahub/api/entities/datajob/datajob.py | 25 +-
.../dataprocess/dataprocess_instance.py | 27 +-
metadata-ingestion/src/datahub/emitter/mcp.py | 4 +-
.../datahub/ingestion/api/source_helpers.py | 13 +-
.../ingestion/source/fivetran/__init__.py | 0
.../ingestion/source/fivetran/config.py | 145 ++++
.../ingestion/source/fivetran/data_classes.py | 36 +
.../ingestion/source/fivetran/fivetran.py | 289 ++++++++
.../source/fivetran/fivetran_log_api.py | 147 ++++
.../source/fivetran/fivetran_query.py | 76 ++
.../ingestion/source_config/sql/snowflake.py | 82 ++-
.../integration/fivetran/fivetran_golden.json | 658 ++++++++++++++++++
.../integration/fivetran/test_fivetran.py | 192 +++++
.../main/resources/boot/data_platforms.json | 10 +
20 files changed, 1777 insertions(+), 70 deletions(-)
create mode 100644 datahub-web-react/src/images/fivetranlogo.png
create mode 100644 metadata-ingestion/docs/sources/fivetran/fivetran_pre.md
create mode 100644 metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/__init__.py
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py
create mode 100644 metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py
create mode 100644 metadata-ingestion/tests/integration/fivetran/fivetran_golden.json
create mode 100644 metadata-ingestion/tests/integration/fivetran/test_fivetran.py
diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts
index dba8e8bb1dce6b..fdb094d721304b 100644
--- a/datahub-web-react/src/app/ingest/source/builder/constants.ts
+++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts
@@ -29,6 +29,7 @@ import databricksLogo from '../../../../images/databrickslogo.png';
import verticaLogo from '../../../../images/verticalogo.png';
import mlflowLogo from '../../../../images/mlflowlogo.png';
import dynamodbLogo from '../../../../images/dynamodblogo.png';
+import fivetranLogo from '../../../../images/fivetranlogo.png';
export const ATHENA = 'athena';
export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`;
@@ -105,6 +106,8 @@ export const DBT_CLOUD = 'dbt-cloud';
export const DBT_CLOUD_URN = `urn:li:dataPlatform:dbt`;
export const VERTICA = 'vertica';
export const VERTICA_URN = `urn:li:dataPlatform:${VERTICA}`;
+export const FIVETRAN = 'fivetran';
+export const FIVETRAN_URN = `urn:li:dataPlatform:${FIVETRAN}`;
export const PLATFORM_URN_TO_LOGO = {
[ATHENA_URN]: athenaLogo,
@@ -138,6 +141,7 @@ export const PLATFORM_URN_TO_LOGO = {
[SUPERSET_URN]: supersetLogo,
[UNITY_CATALOG_URN]: databricksLogo,
[VERTICA_URN]: verticaLogo,
+ [FIVETRAN_URN]: fivetranLogo,
};
export const SOURCE_TO_PLATFORM_URN = {
diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json
index b18384909c33f0..9619abebbd54e6 100644
--- a/datahub-web-react/src/app/ingest/source/builder/sources.json
+++ b/datahub-web-react/src/app/ingest/source/builder/sources.json
@@ -216,6 +216,13 @@
"docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/vertica/",
"recipe": "source:\n type: vertica\n config:\n # Coordinates\n host_port: localhost:5433\n # The name of the vertica database\n database: Database_Name\n # Credentials\n username: Vertica_User\n password: Vertica_Password\n\n include_tables: true\n include_views: true\n include_projections: true\n include_models: true\n include_view_lineage: true\n include_projection_lineage: true\n profiling:\n enabled: false\n stateful_ingestion:\n enabled: true "
},
+ {
+ "urn": "urn:li:dataPlatform:fivetran",
+ "name": "fivetran",
+ "displayName": "Fivetran",
+ "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/fivetran/",
+ "recipe": "source:\n type: fivetran\n config:\n # Fivetran log connector destination server configurations\n fivetran_log_config:\n destination_platform: snowflake\n destination_config:\n # Coordinates\n account_id: snowflake_account_id\n warehouse: warehouse_name\n database: snowflake_db\n log_schema: fivetran_log_schema\n\n # Credentials\n username: ${SNOWFLAKE_USER}\n password: ${SNOWFLAKE_PASS}\n role: snowflake_role\n\n # Optional - filter for certain connector names instead of ingesting everything.\n # connector_patterns:\n # allow:\n # - connector_name\n\n # Optional -- This mapping is optional and only required to configure platform-instance for source\n # A mapping of Fivetran connector id to data platform instance\n # sources_to_platform_instance:\n # calendar_elected:\n # platform_instance: cloud_postgres_instance\n # env: DEV\n\n # Optional -- This mapping is optional and only required to configure platform-instance for destination.\n # A mapping of Fivetran destination id to data platform instance\n # destination_to_platform_instance:\n # calendar_elected:\n # platform_instance: cloud_postgres_instance\n # env: DEV"
+ },
{
"urn": "urn:li:dataPlatform:custom",
"name": "custom",
diff --git a/datahub-web-react/src/images/fivetranlogo.png b/datahub-web-react/src/images/fivetranlogo.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c999ad2d86e99273971dd0d31a18fd5e94733b
GIT binary patch
literal 10230
[binary image data omitted]

diff --git a/metadata-ingestion/docs/sources/fivetran/fivetran_pre.md b/metadata-ingestion/docs/sources/fivetran/fivetran_pre.md
new file mode 100644
--- /dev/null
+++ b/metadata-ingestion/docs/sources/fivetran/fivetran_pre.md
@@ -0,0 +1,86 @@
+```sql
+grant operate, usage on warehouse "<warehouse-name>" to role fivetran_datahub;
+
+// Grant access to view database and schema in which your log and metadata tables exist
+grant usage on DATABASE "<database-name>" to role fivetran_datahub;
+grant usage on SCHEMA "<database-name>"."<schema-name>" to role fivetran_datahub;
+
+// Grant access to execute select query on schema in which your log and metadata tables exist
+grant select on all tables in SCHEMA "<database-name>"."<schema-name>" to role fivetran_datahub;
+
+// Grant the fivetran_datahub role to the snowflake user.
+grant role fivetran_datahub to user snowflake_user;
+```
+
+## Advanced Configurations
+
+### Working with Platform Instances
+If you have multiple instances of source/destination systems referenced in your `fivetran` setup, you need to configure a platform instance for these systems in the `fivetran` recipe to generate correct lineage edges. Refer to [Working with Platform Instances](https://datahubproject.io/docs/platform-instances) to learn more.
+
+When configuring a platform instance for a source system, use the connector id as the key; for a destination system, use the destination id as the key.
+
+#### Example - Multiple Postgres Source Connectors each reading from a different postgres instance
+```yml
+ # Map of connector source to platform instance
+ sources_to_platform_instance:
+ postgres_connector_id1:
+ platform_instance: cloud_postgres_instance
+ env: PROD
+
+ postgres_connector_id2:
+ platform_instance: local_postgres_instance
+ env: DEV
+```
+
+#### Example - Multiple Snowflake Destinations each writing to a different snowflake instance
+```yml
+ # Map of destination to platform instance
+ destination_to_platform_instance:
+ snowflake_destination_id1:
+ platform_instance: prod_snowflake_instance
+ env: PROD
+
+ snowflake_destination_id2:
+ platform_instance: dev_snowflake_instance
+ env: PROD
+```
+
+
+
diff --git a/metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml b/metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml
new file mode 100644
index 00000000000000..7c654df59723c1
--- /dev/null
+++ b/metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml
@@ -0,0 +1,43 @@
+source:
+ type: fivetran
+ config:
+ # Fivetran log connector destination server configurations
+ fivetran_log_config:
+ destination_platform: snowflake
+ destination_config:
+ # Coordinates
+ account_id: "abc48144"
+ warehouse: "COMPUTE_WH"
+ database: "MY_SNOWFLAKE_DB"
+ log_schema: "FIVETRAN_LOG"
+
+ # Credentials
+ username: "${SNOWFLAKE_USER}"
+ password: "${SNOWFLAKE_PASS}"
+ role: "snowflake_role"
+
+ # Optional - filter for certain connector names instead of ingesting everything.
+ # connector_patterns:
+ # allow:
+ # - connector_name
+
+  # Optional -- A mapping of each of the connector's sources to its database.
+ # sources_to_database:
+ # connector_id: source_db
+
+ # Optional -- This mapping is optional and only required to configure platform-instance for source
+ # A mapping of Fivetran connector id to data platform instance
+ # sources_to_platform_instance:
+ # connector_id:
+ # platform_instance: cloud_instance
+ # env: DEV
+
+ # Optional -- This mapping is optional and only required to configure platform-instance for destination.
+ # A mapping of Fivetran destination id to data platform instance
+ # destination_to_platform_instance:
+ # destination_id:
+ # platform_instance: cloud_instance
+ # env: DEV
+
+sink:
+ # sink configs
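
For reference, a recipe like the one above can also be driven from Python. The following is a minimal, hypothetical sketch using DataHub's `Pipeline` API; the Snowflake coordinates, credentials, and the local `datahub-rest` sink address are placeholder values, not settings taken from this patch.

```python
# Sketch: run a Fivetran recipe programmatically (placeholder credentials throughout).
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "fivetran",
            "config": {
                "fivetran_log_config": {
                    "destination_platform": "snowflake",
                    "destination_config": {
                        "account_id": "abc48144",          # placeholder
                        "warehouse": "COMPUTE_WH",         # placeholder
                        "database": "MY_SNOWFLAKE_DB",     # placeholder
                        "log_schema": "FIVETRAN_LOG",      # placeholder
                        "username": "my_user",             # placeholder
                        "password": "my_password",         # placeholder
                        "role": "snowflake_role",          # placeholder
                    },
                },
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},  # assumes a local GMS
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```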
diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
index afce8dcee840b4..2392fce0580613 100644
--- a/metadata-ingestion/setup.py
+++ b/metadata-ingestion/setup.py
@@ -395,6 +395,7 @@
"powerbi-report-server": powerbi_report_server,
"vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.1"},
"unity-catalog": databricks | sqllineage_lib,
+ "fivetran": snowflake_common,
}
# This is mainly used to exclude plugins from the Docker image.
@@ -525,6 +526,7 @@
"nifi",
"vertica",
"mode",
+ "fivetran",
"kafka-connect",
]
if plugin
@@ -629,6 +631,7 @@
"unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource",
"gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource",
"sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource",
+ "fivetran = datahub.ingestion.source.fivetran.fivetran:FivetranSource",
],
"datahub.ingestion.transformer.plugins": [
"simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership",
diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py
index 0face6415bacc4..6c42e830e223b1 100644
--- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py
+++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py
@@ -100,7 +100,9 @@ def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]:
)
return [tags]
- def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
+ def generate_mcp(
+ self, materialize_iolets: bool = True
+ ) -> Iterable[MetadataChangeProposalWrapper]:
mcp = MetadataChangeProposalWrapper(
entityUrn=str(self.urn),
aspect=DataJobInfoClass(
@@ -113,7 +115,9 @@ def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
)
yield mcp
- yield from self.generate_data_input_output_mcp()
+ yield from self.generate_data_input_output_mcp(
+ materialize_iolets=materialize_iolets
+ )
for owner in self.generate_ownership_aspect():
mcp = MetadataChangeProposalWrapper(
@@ -144,7 +148,9 @@ def emit(
for mcp in self.generate_mcp():
emitter.emit(mcp, callback)
- def generate_data_input_output_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
+ def generate_data_input_output_mcp(
+ self, materialize_iolets: bool
+ ) -> Iterable[MetadataChangeProposalWrapper]:
mcp = MetadataChangeProposalWrapper(
entityUrn=str(self.urn),
aspect=DataJobInputOutputClass(
@@ -157,10 +163,9 @@ def generate_data_input_output_mcp(self) -> Iterable[MetadataChangeProposalWrapp
yield mcp
# Force entity materialization
- for iolet in self.inlets + self.outlets:
- mcp = MetadataChangeProposalWrapper(
- entityUrn=str(iolet),
- aspect=StatusClass(removed=False),
- )
-
- yield mcp
+ if materialize_iolets:
+ for iolet in self.inlets + self.outlets:
+ yield MetadataChangeProposalWrapper(
+ entityUrn=str(iolet),
+ aspect=StatusClass(removed=False),
+ )
diff --git a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py
index cf6080c7072e69..2f07e4a112f934 100644
--- a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py
+++ b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py
@@ -220,12 +220,10 @@ def emit_process_end(
self._emit_mcp(mcp, emitter, callback)
def generate_mcp(
- self, created_ts_millis: Optional[int] = None
+ self, created_ts_millis: Optional[int] = None, materialize_iolets: bool = True
) -> Iterable[MetadataChangeProposalWrapper]:
- """
- Generates mcps from the object
- :rtype: Iterable[MetadataChangeProposalWrapper]
- """
+ """Generates mcps from the object"""
+
mcp = MetadataChangeProposalWrapper(
entityUrn=str(self.urn),
aspect=DataProcessInstanceProperties(
@@ -253,7 +251,7 @@ def generate_mcp(
)
yield mcp
- yield from self.generate_inlet_outlet_mcp()
+ yield from self.generate_inlet_outlet_mcp(materialize_iolets=materialize_iolets)
@staticmethod
def _emit_mcp(
@@ -329,7 +327,9 @@ def from_dataflow(dataflow: DataFlow, id: str) -> "DataProcessInstance":
dpi._template_object = dataflow
return dpi
- def generate_inlet_outlet_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
+ def generate_inlet_outlet_mcp(
+ self, materialize_iolets: bool
+ ) -> Iterable[MetadataChangeProposalWrapper]:
if self.inlets:
mcp = MetadataChangeProposalWrapper(
entityUrn=str(self.urn),
@@ -349,10 +349,9 @@ def generate_inlet_outlet_mcp(self) -> Iterable[MetadataChangeProposalWrapper]:
yield mcp
# Force entity materialization
- for iolet in self.inlets + self.outlets:
- mcp = MetadataChangeProposalWrapper(
- entityUrn=str(iolet),
- aspect=StatusClass(removed=False),
- )
-
- yield mcp
+ if materialize_iolets:
+ for iolet in self.inlets + self.outlets:
+ yield MetadataChangeProposalWrapper(
+ entityUrn=str(iolet),
+ aspect=StatusClass(removed=False),
+ )
diff --git a/metadata-ingestion/src/datahub/emitter/mcp.py b/metadata-ingestion/src/datahub/emitter/mcp.py
index 9085ac152ea0b2..d6aa695665e4ef 100644
--- a/metadata-ingestion/src/datahub/emitter/mcp.py
+++ b/metadata-ingestion/src/datahub/emitter/mcp.py
@@ -240,7 +240,7 @@ def from_obj_require_wrapper(
return mcp
def as_workunit(
- self, *, treat_errors_as_warnings: bool = False
+ self, *, treat_errors_as_warnings: bool = False, is_primary_source: bool = True
) -> "MetadataWorkUnit":
from datahub.ingestion.api.workunit import MetadataWorkUnit
@@ -254,10 +254,12 @@ def as_workunit(
id=f"{self.entityUrn}-{self.aspectName}-{ts}",
mcp=self,
treat_errors_as_warnings=treat_errors_as_warnings,
+ is_primary_source=is_primary_source,
)
return MetadataWorkUnit(
id=f"{self.entityUrn}-{self.aspectName}",
mcp=self,
treat_errors_as_warnings=treat_errors_as_warnings,
+ is_primary_source=is_primary_source,
)
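
The new `is_primary_source` flag is what lets a source emit aspects for entities it merely references (for example, upstream datasets) without stateful ingestion treating those entities as owned by the source. A minimal sketch of the call shape, using a hypothetical postgres dataset urn:

```python
# Sketch: emit a status aspect for a referenced dataset, but mark the workunit as
# non-primary so stateful ingestion does not claim ownership of the dataset.
from datahub.emitter.mce_builder import make_dataset_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.schema_classes import StatusClass

wu = MetadataChangeProposalWrapper(
    entityUrn=make_dataset_urn(platform="postgres", name="public.orders", env="PROD"),
    aspect=StatusClass(removed=False),
).as_workunit(is_primary_source=False)
```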
diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
index 2ce9e07bc57bc8..fae260226195ce 100644
--- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
+++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py
@@ -17,6 +17,7 @@
from datahub.configuration.time_window_config import BaseTimeWindowConfig
from datahub.emitter.mce_builder import make_dataplatform_instance_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
+from datahub.emitter.mcp_builder import entity_supports_aspect
from datahub.ingestion.api.workunit import MetadataWorkUnit
from datahub.metadata.schema_classes import (
BrowsePathEntryClass,
@@ -64,9 +65,9 @@ def auto_status_aspect(
"""
For all entities that don't have a status aspect, add one with removed set to false.
"""
-
all_urns: Set[str] = set()
status_urns: Set[str] = set()
+ skip_urns: Set[str] = set()
for wu in stream:
urn = wu.get_urn()
all_urns.add(urn)
@@ -89,9 +90,17 @@ def auto_status_aspect(
else:
raise ValueError(f"Unexpected type {type(wu.metadata)}")
+ if not isinstance(
+ wu.metadata, MetadataChangeEventClass
+ ) and not entity_supports_aspect(wu.metadata.entityType, StatusClass):
+            # If an entity type does not support the status aspect, skip adding it for that entity.
+            # For example, dataProcessInstance does not support the status aspect; emitting it anyway
+            # fails with: java.lang.RuntimeException: Unknown aspect status for entity dataProcessInstance
+ skip_urns.add(urn)
+
yield wu
- for urn in sorted(all_urns - status_urns):
+ for urn in sorted(all_urns - status_urns - skip_urns):
yield MetadataChangeProposalWrapper(
entityUrn=urn,
aspect=StatusClass(removed=False),
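
A quick illustration of the guard added above, assuming `entity_supports_aspect` consults the entity registry exactly as it is used in this hunk; the entity names are examples:

```python
# Sketch: entity types whose definition lacks the status aspect are skipped,
# mirroring the skip_urns check added to auto_status_aspect above.
from datahub.emitter.mcp_builder import entity_supports_aspect
from datahub.metadata.schema_classes import StatusClass

print(entity_supports_aspect("dataset", StatusClass))              # expected: True
print(entity_supports_aspect("dataProcessInstance", StatusClass))  # expected: False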
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py
new file mode 100644
index 00000000000000..b0843182c5cac4
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py
@@ -0,0 +1,145 @@
+import logging
+from dataclasses import dataclass, field as dataclass_field
+from typing import Dict, List, Optional
+
+import pydantic
+from pydantic import Field, root_validator
+
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
+from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+ StaleEntityRemovalSourceReport,
+ StatefulStaleMetadataRemovalConfig,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+ StatefulIngestionConfigBase,
+)
+from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig
+
+logger = logging.getLogger(__name__)
+
+
+class Constant:
+ """
+ keys used in fivetran plugin
+ """
+
+ ORCHESTRATOR = "fivetran"
+ # table column name
+ SOURCE_SCHEMA_NAME = "source_schema_name"
+ SOURCE_TABLE_NAME = "source_table_name"
+ SOURCE_TABLE_ID = "source_table_id"
+ SOURCE_COLUMN_NAME = "source_column_name"
+ DESTINATION_SCHEMA_NAME = "destination_schema_name"
+ DESTINATION_TABLE_NAME = "destination_table_name"
+ DESTINATION_TABLE_ID = "destination_table_id"
+ DESTINATION_COLUMN_NAME = "destination_column_name"
+ SYNC_ID = "sync_id"
+ MESSAGE_DATA = "message_data"
+ TIME_STAMP = "time_stamp"
+ STATUS = "status"
+ USER_ID = "user_id"
+ GIVEN_NAME = "given_name"
+ FAMILY_NAME = "family_name"
+ CONNECTOR_ID = "connector_id"
+ CONNECTOR_NAME = "connector_name"
+ CONNECTOR_TYPE_ID = "connector_type_id"
+ PAUSED = "paused"
+ SYNC_FREQUENCY = "sync_frequency"
+ DESTINATION_ID = "destination_id"
+ CONNECTING_USER_ID = "connecting_user_id"
+ # Job status constants
+ SUCCESSFUL = "SUCCESSFUL"
+ FAILURE_WITH_TASK = "FAILURE_WITH_TASK"
+ CANCELED = "CANCELED"
+
+
+KNOWN_DATA_PLATFORM_MAPPING = {
+ "postgres": "postgres",
+ "snowflake": "snowflake",
+}
+
+
+class DestinationConfig(BaseSnowflakeConfig):
+ database: str = Field(description="The fivetran connector log database.")
+ log_schema: str = Field(description="The fivetran connector log schema.")
+
+
+class FivetranLogConfig(ConfigModel):
+ destination_platform: str = pydantic.Field(
+ default="snowflake",
+ description="The destination platform where fivetran connector log tables are dumped.",
+ )
+ destination_config: Optional[DestinationConfig] = pydantic.Field(
+ default=None,
+ description="If destination platform is 'snowflake', provide snowflake configuration.",
+ )
+
+ @root_validator(pre=True)
+ def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict:
+ destination_platform = values["destination_platform"]
+ if destination_platform == "snowflake":
+ if "destination_config" not in values:
+ raise ValueError(
+ "If destination platform is 'snowflake', user must provide snowflake destination configuration in the recipe."
+ )
+ else:
+ raise ValueError(
+ f"Destination platform '{destination_platform}' is not yet supported."
+ )
+ return values
+
+
+@dataclass
+class FivetranSourceReport(StaleEntityRemovalSourceReport):
+ connectors_scanned: int = 0
+ filtered_connectors: List[str] = dataclass_field(default_factory=list)
+
+ def report_connectors_scanned(self, count: int = 1) -> None:
+ self.connectors_scanned += count
+
+ def report_connectors_dropped(self, model: str) -> None:
+ self.filtered_connectors.append(model)
+
+
+class PlatformDetail(ConfigModel):
+ platform_instance: Optional[str] = pydantic.Field(
+ default=None,
+ description="The instance of the platform that all assets produced by this recipe belong to",
+ )
+ env: str = pydantic.Field(
+ default=DEFAULT_ENV,
+ description="The environment that all assets produced by DataHub platform ingestion source belong to",
+ )
+
+
+class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
+ fivetran_log_config: FivetranLogConfig = pydantic.Field(
+ description="Fivetran log connector destination server configurations.",
+ )
+ connector_patterns: AllowDenyPattern = Field(
+ default=AllowDenyPattern.allow_all(),
+ description="Regex patterns for connectors to filter in ingestion.",
+ )
+ include_column_lineage: bool = Field(
+ default=True,
+ description="Populates table->table column lineage.",
+ )
+ sources_to_database: Dict[str, str] = pydantic.Field(
+ default={},
+ description="A mapping of the connector's all sources to its database. Use connector id as key.",
+ )
+ # Configuration for stateful ingestion
+ stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field(
+ default=None, description="Airbyte Stateful Ingestion Config."
+ )
+ # Fivetran connector all sources to platform instance mapping
+ sources_to_platform_instance: Dict[str, PlatformDetail] = pydantic.Field(
+ default={},
+ description="A mapping of the connector's all sources dataset to platform instance. Use connector id as key.",
+ )
+ # Fivetran destination to platform instance mapping
+ destination_to_platform_instance: Dict[str, PlatformDetail] = pydantic.Field(
+ default={},
+ description="A mapping of destination dataset to platform instance. Use destination id as key.",
+ )
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py
new file mode 100644
index 00000000000000..82bb5f3467c2a6
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py
@@ -0,0 +1,36 @@
+from dataclasses import dataclass
+from typing import List
+
+
+@dataclass
+class ColumnLineage:
+ source_column: str
+ destination_column: str
+
+
+@dataclass
+class TableLineage:
+ source_table: str
+ destination_table: str
+ column_lineage: List[ColumnLineage]
+
+
+@dataclass
+class Connector:
+ connector_id: str
+ connector_name: str
+ connector_type: str
+ paused: bool
+ sync_frequency: int
+ destination_id: str
+ user_name: str
+ table_lineage: List[TableLineage]
+ jobs: List["Job"]
+
+
+@dataclass
+class Job:
+ job_id: str
+ start_time: int
+ end_time: int
+ status: str
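
To make these shapes concrete, here is a small hypothetical example of the records the Fivetran source builds for a single connector; all ids, names, and timestamps are invented for illustration.

```python
# Hypothetical example of one connector's lineage and sync history records.
from datahub.ingestion.source.fivetran.data_classes import (
    ColumnLineage,
    Connector,
    Job,
    TableLineage,
)

connector = Connector(
    connector_id="calendar_elected",               # invented connector id
    connector_name="postgres",
    connector_type="postgres",
    paused=False,
    sync_frequency=1440,
    destination_id="interval_unconstitutional",    # invented destination id
    user_name="jane.doe",
    table_lineage=[
        TableLineage(
            source_table="public.employee",
            destination_table="postgres_public.employee",
            column_lineage=[ColumnLineage(source_column="id", destination_column="id")],
        )
    ],
    jobs=[Job(job_id="4c9a03d6", start_time=1695191853, end_time=1695191885, status="SUCCESSFUL")],
)
```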
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py
new file mode 100644
index 00000000000000..c0395b4e4e7963
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py
@@ -0,0 +1,289 @@
+import logging
+from typing import Dict, Iterable, List, Optional
+
+import datahub.emitter.mce_builder as builder
+from datahub.api.entities.datajob import DataFlow, DataJob
+from datahub.api.entities.dataprocess.dataprocess_instance import (
+ DataProcessInstance,
+ InstanceRunResult,
+)
+from datahub.ingestion.api.common import PipelineContext
+from datahub.ingestion.api.decorators import (
+ SourceCapability,
+ SupportStatus,
+ capability,
+ config_class,
+ platform_name,
+ support_status,
+)
+from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport
+from datahub.ingestion.api.workunit import MetadataWorkUnit
+from datahub.ingestion.source.fivetran.config import (
+ KNOWN_DATA_PLATFORM_MAPPING,
+ Constant,
+ FivetranSourceConfig,
+ FivetranSourceReport,
+ PlatformDetail,
+)
+from datahub.ingestion.source.fivetran.data_classes import Connector, Job
+from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI
+from datahub.ingestion.source.state.stale_entity_removal_handler import (
+ StaleEntityRemovalHandler,
+)
+from datahub.ingestion.source.state.stateful_ingestion_base import (
+ StatefulIngestionSourceBase,
+)
+from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
+ FineGrainedLineage,
+ FineGrainedLineageDownstreamType,
+ FineGrainedLineageUpstreamType,
+)
+from datahub.metadata.schema_classes import StatusClass
+from datahub.utilities.urns.data_flow_urn import DataFlowUrn
+from datahub.utilities.urns.dataset_urn import DatasetUrn
+
+# Logger instance
+logger = logging.getLogger(__name__)
+
+
+@platform_name("Fivetran")
+@config_class(FivetranSourceConfig)
+@support_status(SupportStatus.INCUBATING)
+@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default")
+@capability(
+ SourceCapability.LINEAGE_FINE,
+ "Enabled by default, can be disabled via configuration `include_column_lineage`",
+)
+class FivetranSource(StatefulIngestionSourceBase):
+ """
+ This plugin extracts fivetran users, connectors, destinations and sync history.
+    This plugin is in beta and has only been tested with the Snowflake connector.
+ """
+
+ config: FivetranSourceConfig
+ report: FivetranSourceReport
+ platform: str = "fivetran"
+
+ def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext):
+ super(FivetranSource, self).__init__(config, ctx)
+ self.config = config
+ self.report = FivetranSourceReport()
+
+ self.audit_log = FivetranLogAPI(self.config.fivetran_log_config)
+
+ # Create and register the stateful ingestion use-case handler.
+ self.stale_entity_removal_handler = StaleEntityRemovalHandler.create(
+ self, self.config, self.ctx
+ )
+
+ def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None:
+ input_dataset_urn_list: List[DatasetUrn] = []
+ output_dataset_urn_list: List[DatasetUrn] = []
+ fine_grained_lineage: List[FineGrainedLineage] = []
+
+ source_platform_detail: PlatformDetail = PlatformDetail()
+ destination_platform_detail: PlatformDetail = PlatformDetail()
+ # Get platform details for connector source
+ source_platform_detail = self.config.sources_to_platform_instance.get(
+ connector.connector_id, PlatformDetail()
+ )
+
+ # Get platform details for destination
+ destination_platform_detail = self.config.destination_to_platform_instance.get(
+ connector.destination_id, PlatformDetail()
+ )
+
+ # Get database for connector source
+ # TODO: Once Fivetran exposes this, we shouldn't ask for it via config.
+ source_database: Optional[str] = self.config.sources_to_database.get(
+ connector.connector_id
+ )
+
+ if connector.connector_type in KNOWN_DATA_PLATFORM_MAPPING:
+ source_platform = KNOWN_DATA_PLATFORM_MAPPING[connector.connector_type]
+ else:
+ source_platform = connector.connector_type
+ logger.info(
+ f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity."
+ )
+
+ for table_lineage in connector.table_lineage:
+ input_dataset_urn = DatasetUrn.create_from_ids(
+ platform_id=source_platform,
+ table_name=f"{source_database.lower()}.{table_lineage.source_table}"
+ if source_database
+ else table_lineage.source_table,
+ env=source_platform_detail.env,
+ platform_instance=source_platform_detail.platform_instance,
+ )
+ input_dataset_urn_list.append(input_dataset_urn)
+
+ output_dataset_urn: Optional[DatasetUrn] = None
+ if self.audit_log.fivetran_log_database:
+ output_dataset_urn = DatasetUrn.create_from_ids(
+ platform_id=self.config.fivetran_log_config.destination_platform,
+ table_name=f"{self.audit_log.fivetran_log_database.lower()}.{table_lineage.destination_table}",
+ env=destination_platform_detail.env,
+ platform_instance=destination_platform_detail.platform_instance,
+ )
+ output_dataset_urn_list.append(output_dataset_urn)
+
+ if self.config.include_column_lineage:
+ for column_lineage in table_lineage.column_lineage:
+ fine_grained_lineage.append(
+ FineGrainedLineage(
+ upstreamType=FineGrainedLineageUpstreamType.FIELD_SET,
+ upstreams=[
+ builder.make_schema_field_urn(
+ str(input_dataset_urn),
+ column_lineage.source_column,
+ )
+ ]
+ if input_dataset_urn
+ else [],
+ downstreamType=FineGrainedLineageDownstreamType.FIELD,
+ downstreams=[
+ builder.make_schema_field_urn(
+ str(output_dataset_urn),
+ column_lineage.destination_column,
+ )
+ ]
+ if output_dataset_urn
+ else [],
+ )
+ )
+
+ datajob.inlets.extend(input_dataset_urn_list)
+ datajob.outlets.extend(output_dataset_urn_list)
+ datajob.fine_grained_lineages.extend(fine_grained_lineage)
+ return None
+
+ def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow:
+ return DataFlow(
+ orchestrator=Constant.ORCHESTRATOR,
+ id=connector.connector_id,
+ env=self.config.env,
+ name=connector.connector_name,
+ platform_instance=self.config.platform_instance,
+ )
+
+ def _generate_datajob_from_connector(self, connector: Connector) -> DataJob:
+ dataflow_urn = DataFlowUrn.create_from_ids(
+ orchestrator=Constant.ORCHESTRATOR,
+ flow_id=connector.connector_id,
+ env=self.config.env,
+ platform_instance=self.config.platform_instance,
+ )
+ datajob = DataJob(
+ id=connector.connector_id,
+ flow_urn=dataflow_urn,
+ name=connector.connector_name,
+ owners={connector.user_name},
+ )
+
+ job_property_bag: Dict[str, str] = {}
+ allowed_connection_keys = [
+ Constant.PAUSED,
+ Constant.SYNC_FREQUENCY,
+ Constant.DESTINATION_ID,
+ ]
+ for key in allowed_connection_keys:
+ if hasattr(connector, key) and getattr(connector, key) is not None:
+ job_property_bag[key] = repr(getattr(connector, key))
+ datajob.properties = job_property_bag
+
+        # Map the connector's source and destination tables to dataset entities.
+        # Also extend the column-level (fine-grained) lineage if include_column_lineage is True.
+ self._extend_lineage(connector=connector, datajob=datajob)
+
+        # TODO: Add dataset-level fine-grained lineage once FineGrainedLineageDownstreamType.DATASET is enabled.
+
+ return datajob
+
+ def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance:
+ return DataProcessInstance.from_datajob(
+ datajob=datajob,
+ id=job.job_id,
+ clone_inlets=True,
+ clone_outlets=True,
+ )
+
+ def _get_dpi_workunits(
+ self, job: Job, dpi: DataProcessInstance
+ ) -> Iterable[MetadataWorkUnit]:
+ status_result_map: Dict[str, InstanceRunResult] = {
+ Constant.SUCCESSFUL: InstanceRunResult.SUCCESS,
+ Constant.FAILURE_WITH_TASK: InstanceRunResult.FAILURE,
+ Constant.CANCELED: InstanceRunResult.SKIPPED,
+ }
+ if job.status not in status_result_map:
+ logger.debug(
+ f"Status should be either SUCCESSFUL, FAILURE_WITH_TASK or CANCELED and it was "
+ f"{job.status}"
+ )
+            return
+ result = status_result_map[job.status]
+ start_timestamp_millis = job.start_time * 1000
+ for mcp in dpi.generate_mcp(
+ created_ts_millis=start_timestamp_millis, materialize_iolets=False
+ ):
+ yield mcp.as_workunit()
+ for mcp in dpi.start_event_mcp(start_timestamp_millis):
+ yield mcp.as_workunit()
+ for mcp in dpi.end_event_mcp(
+ end_timestamp_millis=job.end_time * 1000,
+ result=result,
+ result_type=Constant.ORCHESTRATOR,
+ ):
+ yield mcp.as_workunit()
+
+ def _get_connector_workunits(
+ self, connector: Connector
+ ) -> Iterable[MetadataWorkUnit]:
+ self.report.report_connectors_scanned()
+        # Create a dataflow entity with the same name as the connector.
+ dataflow = self._generate_dataflow_from_connector(connector)
+ for mcp in dataflow.generate_mcp():
+ yield mcp.as_workunit()
+
+        # Map Fivetran's connector entity to DataHub's datajob entity.
+ datajob = self._generate_datajob_from_connector(connector)
+ for mcp in datajob.generate_mcp(materialize_iolets=True):
+ if mcp.entityType == "dataset" and isinstance(mcp.aspect, StatusClass):
+ # While we "materialize" the referenced datasets, we don't want them
+ # to be tracked by stateful ingestion.
+ yield mcp.as_workunit(is_primary_source=False)
+ else:
+ yield mcp.as_workunit()
+
+        # Map Fivetran's job/sync history entity to DataHub's data process instance entity.
+ for job in connector.jobs:
+ dpi = self._generate_dpi_from_job(job, datajob)
+ yield from self._get_dpi_workunits(job, dpi)
+
+ @classmethod
+ def create(cls, config_dict: dict, ctx: PipelineContext) -> Source:
+ config = FivetranSourceConfig.parse_obj(config_dict)
+ return cls(config, ctx)
+
+ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
+ return [
+ *super().get_workunit_processors(),
+ self.stale_entity_removal_handler.workunit_processor,
+ ]
+
+ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
+ """
+        The DataHub ingestion framework invokes this method.
+ """
+ logger.info("Fivetran plugin execution is started")
+ connectors = self.audit_log.get_connectors_list()
+ for connector in connectors:
+ if not self.config.connector_patterns.allowed(connector.connector_name):
+ self.report.report_connectors_dropped(connector.connector_name)
+ continue
+ logger.info(f"Processing connector id: {connector.connector_id}")
+ yield from self._get_connector_workunits(connector)
+
+ def get_report(self) -> SourceReport:
+ return self.report
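Note: the dataset URNs emitted by _extend_lineage are built with the same DatasetUrn.create_from_ids call shown above. A minimal standalone sketch of that construction, using values from the integration test and golden file added later in this patch (the import path is an assumption):

    from datahub.utilities.urns.dataset_urn import DatasetUrn  # import path assumed

    source_database = "postgres_db"   # from sources_to_database in the recipe
    source_table = "public.employee"  # "<schema>.<table>" from the Fivetran log

    urn = DatasetUrn.create_from_ids(
        platform_id="postgres",
        table_name=f"{source_database.lower()}.{source_table}",
        env="DEV",               # from sources_to_platform_instance
        platform_instance=None,  # no platform instance configured in the test
    )
    # Matches the golden file:
    # urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)
    print(urn)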
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py
new file mode 100644
index 00000000000000..d5d146559d9183
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py
@@ -0,0 +1,147 @@
+import json
+import logging
+from typing import Any, Dict, List, Optional
+
+from sqlalchemy import create_engine
+
+from datahub.ingestion.source.fivetran.config import Constant, FivetranLogConfig
+from datahub.ingestion.source.fivetran.data_classes import (
+ ColumnLineage,
+ Connector,
+ Job,
+ TableLineage,
+)
+from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery
+
+logger: logging.Logger = logging.getLogger(__name__)
+
+
+class FivetranLogAPI:
+ def __init__(self, fivetran_log_config: FivetranLogConfig) -> None:
+ self.fivetran_log_database: Optional[str] = None
+ self.fivetran_log_config = fivetran_log_config
+ self.engine = self._get_log_destination_engine()
+
+ def _get_log_destination_engine(self) -> Any:
+ destination_platform = self.fivetran_log_config.destination_platform
+ engine = None
+        # For each supported destination, create a SQLAlchemy engine,
+        # select the database and schema, and set the fivetran_log_database instance variable.
+ if destination_platform == "snowflake":
+ snowflake_destination_config = self.fivetran_log_config.destination_config
+ if snowflake_destination_config is not None:
+ engine = create_engine(
+ snowflake_destination_config.get_sql_alchemy_url(),
+ **snowflake_destination_config.get_options(),
+ )
+ engine.execute(
+ FivetranLogQuery.use_schema(
+ snowflake_destination_config.database,
+ snowflake_destination_config.log_schema,
+ )
+ )
+ self.fivetran_log_database = snowflake_destination_config.database
+ return engine
+
+ def _query(self, query: str) -> List[Dict]:
+ logger.debug("Query : {}".format(query))
+ resp = self.engine.execute(query)
+ return [row for row in resp]
+
+ def _get_table_lineage(self, connector_id: str) -> List[TableLineage]:
+ table_lineage_result = self._query(
+ FivetranLogQuery.get_table_lineage_query(connector_id=connector_id)
+ )
+ table_lineage_list: List[TableLineage] = []
+ for table_lineage in table_lineage_result:
+ column_lineage_result = self._query(
+ FivetranLogQuery.get_column_lineage_query(
+ source_table_id=table_lineage[Constant.SOURCE_TABLE_ID],
+ destination_table_id=table_lineage[Constant.DESTINATION_TABLE_ID],
+ )
+ )
+ column_lineage_list: List[ColumnLineage] = [
+ ColumnLineage(
+ source_column=column_lineage[Constant.SOURCE_COLUMN_NAME],
+ destination_column=column_lineage[Constant.DESTINATION_COLUMN_NAME],
+ )
+ for column_lineage in column_lineage_result
+ ]
+ table_lineage_list.append(
+ TableLineage(
+ source_table=f"{table_lineage[Constant.SOURCE_SCHEMA_NAME]}.{table_lineage[Constant.SOURCE_TABLE_NAME]}",
+ destination_table=f"{table_lineage[Constant.DESTINATION_SCHEMA_NAME]}.{table_lineage[Constant.DESTINATION_TABLE_NAME]}",
+ column_lineage=column_lineage_list,
+ )
+ )
+
+ return table_lineage_list
+
+ def _get_jobs_list(self, connector_id: str) -> List[Job]:
+ jobs: List[Job] = []
+ sync_start_logs = {
+ row[Constant.SYNC_ID]: row
+ for row in self._query(
+ FivetranLogQuery.get_sync_start_logs_query(connector_id=connector_id)
+ )
+ }
+ sync_end_logs = {
+ row[Constant.SYNC_ID]: row
+ for row in self._query(
+ FivetranLogQuery.get_sync_end_logs_query(connector_id=connector_id)
+ )
+ }
+ for sync_id in sync_start_logs.keys():
+ if sync_end_logs.get(sync_id) is None:
+                # If there is no sync-end event log for this sync id, the sync is still in progress.
+ continue
+
+ message_data = json.loads(sync_end_logs[sync_id][Constant.MESSAGE_DATA])
+ if isinstance(message_data, str):
+                # Sometimes message_data contains a JSON string nested inside another string,
+                # e.g. '"{\"status\":\"SUCCESSFUL\"}"'.
+                # Hence, we need to call json.loads twice.
+ message_data = json.loads(message_data)
+
+ jobs.append(
+ Job(
+ job_id=sync_id,
+ start_time=round(
+ sync_start_logs[sync_id][Constant.TIME_STAMP].timestamp()
+ ),
+ end_time=round(
+ sync_end_logs[sync_id][Constant.TIME_STAMP].timestamp()
+ ),
+ status=message_data[Constant.STATUS],
+ )
+ )
+ return jobs
+
+ def _get_user_name(self, user_id: str) -> str:
+ user_details = self._query(FivetranLogQuery.get_user_query(user_id=user_id))[0]
+ return (
+ f"{user_details[Constant.GIVEN_NAME]} {user_details[Constant.FAMILY_NAME]}"
+ )
+
+ def get_connectors_list(self) -> List[Connector]:
+ connectors: List[Connector] = []
+ connector_list = self._query(FivetranLogQuery.get_connectors_query())
+ for connector in connector_list:
+ connectors.append(
+ Connector(
+ connector_id=connector[Constant.CONNECTOR_ID],
+ connector_name=connector[Constant.CONNECTOR_NAME],
+ connector_type=connector[Constant.CONNECTOR_TYPE_ID],
+ paused=connector[Constant.PAUSED],
+ sync_frequency=connector[Constant.SYNC_FREQUENCY],
+ destination_id=connector[Constant.DESTINATION_ID],
+ user_name=self._get_user_name(
+ connector[Constant.CONNECTING_USER_ID]
+ ),
+ table_lineage=self._get_table_lineage(
+ connector[Constant.CONNECTOR_ID]
+ ),
+ jobs=self._get_jobs_list(connector[Constant.CONNECTOR_ID]),
+ )
+ )
+ return connectors
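Note: the double json.loads in _get_jobs_list exists because some sync-end events store message_data as a JSON document wrapped in another string. A small self-contained sketch of that unwrapping, with a payload shaped like the ones used in the integration test below:

    import json

    # message_data as read from the Fivetran log: a JSON document wrapped in a string.
    raw = '"{\\"status\\":\\"SUCCESSFUL\\"}"'

    message_data = json.loads(raw)  # first pass still yields a str: '{"status":"SUCCESSFUL"}'
    if isinstance(message_data, str):
        message_data = json.loads(message_data)  # second pass yields the dict

    assert message_data["status"] == "SUCCESSFUL"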
diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py
new file mode 100644
index 00000000000000..4f52fcd5d884fb
--- /dev/null
+++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py
@@ -0,0 +1,76 @@
+class FivetranLogQuery:
+ @staticmethod
+ def use_schema(db_name: str, schema_name: str) -> str:
+ return f'use schema "{db_name}"."{schema_name}"'
+
+ @staticmethod
+ def get_connectors_query() -> str:
+ return """
+ SELECT connector_id as "CONNECTOR_ID",
+ connecting_user_id as "CONNECTING_USER_ID",
+ connector_type_id as "CONNECTOR_TYPE_ID",
+ connector_name as "CONNECTOR_NAME",
+ paused as "PAUSED",
+ sync_frequency as "SYNC_FREQUENCY",
+ destination_id as "DESTINATION_ID"
+ FROM CONNECTOR
+ WHERE _fivetran_deleted = FALSE"""
+
+ @staticmethod
+ def get_user_query(user_id: str) -> str:
+ return f"""
+ SELECT id as "USER_ID",
+ given_name as "GIVEN_NAME",
+ family_name as "FAMILY_NAME"
+ FROM USER
+ WHERE id = '{user_id}'"""
+
+ @staticmethod
+ def get_sync_start_logs_query(
+ connector_id: str,
+ ) -> str:
+ return f"""
+ SELECT time_stamp as "TIME_STAMP",
+ sync_id as "SYNC_ID"
+ FROM LOG
+ WHERE message_event = 'sync_start'
+ and connector_id = '{connector_id}' order by time_stamp"""
+
+ @staticmethod
+ def get_sync_end_logs_query(connector_id: str) -> str:
+ return f"""
+ SELECT time_stamp as "TIME_STAMP",
+ sync_id as "SYNC_ID",
+ message_data as "MESSAGE_DATA"
+ FROM LOG
+ WHERE message_event = 'sync_end'
+ and connector_id = '{connector_id}' order by time_stamp"""
+
+ @staticmethod
+ def get_table_lineage_query(connector_id: str) -> str:
+ return f"""
+ SELECT stm.id as "SOURCE_TABLE_ID",
+ stm.name as "SOURCE_TABLE_NAME",
+ ssm.name as "SOURCE_SCHEMA_NAME",
+ dtm.id as "DESTINATION_TABLE_ID",
+ dtm.name as "DESTINATION_TABLE_NAME",
+ dsm.name as "DESTINATION_SCHEMA_NAME"
+ FROM table_lineage as tl
+ JOIN source_table_metadata as stm on tl.source_table_id = stm.id
+ JOIN destination_table_metadata as dtm on tl.destination_table_id = dtm.id
+ JOIN source_schema_metadata as ssm on stm.schema_id = ssm.id
+ JOIN destination_schema_metadata as dsm on dtm.schema_id = dsm.id
+ WHERE stm.connector_id = '{connector_id}'"""
+
+ @staticmethod
+ def get_column_lineage_query(
+ source_table_id: str, destination_table_id: str
+ ) -> str:
+ return f"""
+ SELECT scm.name as "SOURCE_COLUMN_NAME",
+ dcm.name as "DESTINATION_COLUMN_NAME"
+ FROM column_lineage as cl
+ JOIN source_column_metadata as scm on
+ (cl.source_column_id = scm.id and scm.table_id = {source_table_id})
+ JOIN destination_column_metadata as dcm on
+ (cl.destination_column_id = dcm.id and dcm.table_id = {destination_table_id})"""
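Note: these helpers return plain SQL strings, so they can be exercised directly against any SQLAlchemy engine pointed at the Fivetran log schema, which is how FivetranLogAPI._query uses them. A hedged sketch (the connection URL is a placeholder; in the source it comes from the Snowflake destination config):

    from sqlalchemy import create_engine

    from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery

    engine = create_engine("snowflake://<user>:<password>@<account>")  # placeholder URL

    # Point the session at the Fivetran log database/schema, then list connectors.
    engine.execute(FivetranLogQuery.use_schema("TEST_DATABASE", "TEST_SCHEMA"))
    for row in engine.execute(FivetranLogQuery.get_connectors_query()):
        print(row["CONNECTOR_ID"], row["CONNECTOR_NAME"])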
diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
index c3e8c175f1de54..9fc697018ecd6b 100644
--- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
+++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py
@@ -12,7 +12,7 @@
OAUTH_AUTHENTICATOR,
)
-from datahub.configuration.common import AllowDenyPattern
+from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider
from datahub.configuration.time_window_config import BaseTimeWindowConfig
from datahub.configuration.validate_field_rename import pydantic_renamed_field
@@ -42,9 +42,14 @@
SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com"
-class BaseSnowflakeConfig(BaseTimeWindowConfig):
+class BaseSnowflakeConfig(ConfigModel):
# Note: this config model is also used by the snowflake-usage source.
+ options: dict = pydantic.Field(
+ default_factory=dict,
+ description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.",
+ )
+
scheme: str = "snowflake"
username: Optional[str] = pydantic.Field(
default=None, description="Snowflake username."
@@ -82,14 +87,6 @@ class BaseSnowflakeConfig(BaseTimeWindowConfig):
default=None, description="Snowflake warehouse."
)
role: Optional[str] = pydantic.Field(default=None, description="Snowflake role.")
- include_table_lineage: bool = pydantic.Field(
- default=True,
- description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.",
- )
- include_view_lineage: bool = pydantic.Field(
- default=True,
- description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.",
- )
connect_args: Optional[Dict[str, Any]] = pydantic.Field(
default=None,
description="Connect args to pass to Snowflake SqlAlchemy driver",
@@ -166,18 +163,6 @@ def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None:
"but should be set when using use_certificate false for oauth_config"
)
- @pydantic.root_validator()
- def validate_include_view_lineage(cls, values):
- if (
- "include_table_lineage" in values
- and not values.get("include_table_lineage")
- and values.get("include_view_lineage")
- ):
- raise ValueError(
- "include_table_lineage must be True for include_view_lineage to be set."
- )
- return values
-
def get_sql_alchemy_url(
self,
database: Optional[str] = None,
@@ -261,28 +246,8 @@ def get_connect_args(self) -> dict:
self._computed_connect_args = connect_args
return connect_args
-
-class SnowflakeConfig(BaseSnowflakeConfig, SQLCommonConfig):
- database_pattern: AllowDenyPattern = AllowDenyPattern(
- deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"]
- )
-
- ignore_start_time_lineage: bool = False
- upstream_lineage_in_report: bool = False
-
- def get_sql_alchemy_url(
- self,
- database: Optional[str] = None,
- username: Optional[str] = None,
- password: Optional[pydantic.SecretStr] = None,
- role: Optional[str] = None,
- ) -> str:
- return super().get_sql_alchemy_url(
- database=database, username=username, password=password, role=role
- )
-
def get_options(self) -> dict:
- options_connect_args: Dict = super().get_connect_args()
+ options_connect_args: Dict = self.get_connect_args()
options_connect_args.update(self.options.get("connect_args", {}))
self.options["connect_args"] = options_connect_args
return self.options
@@ -372,3 +337,34 @@ def get_connection(self) -> snowflake.connector.SnowflakeConnection:
else:
# not expected to be here
raise Exception("Not expected to be here.")
+
+
+class SnowflakeConfig(BaseSnowflakeConfig, BaseTimeWindowConfig, SQLCommonConfig):
+
+ include_table_lineage: bool = pydantic.Field(
+ default=True,
+ description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.",
+ )
+ include_view_lineage: bool = pydantic.Field(
+ default=True,
+ description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.",
+ )
+
+ database_pattern: AllowDenyPattern = AllowDenyPattern(
+ deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"]
+ )
+
+ ignore_start_time_lineage: bool = False
+ upstream_lineage_in_report: bool = False
+
+ @pydantic.root_validator()
+ def validate_include_view_lineage(cls, values):
+ if (
+ "include_table_lineage" in values
+ and not values.get("include_table_lineage")
+ and values.get("include_view_lineage")
+ ):
+ raise ValueError(
+ "include_table_lineage must be True for include_view_lineage to be set."
+ )
+ return values
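Note: with the lineage flags and their validator moved onto SnowflakeConfig, the constraint they enforce can be demonstrated directly. A minimal sketch of the expected behavior (the exact set of required connection fields is an assumption; any missing required field would also surface as a ValidationError):

    import pydantic
    import pytest

    from datahub.ingestion.source_config.sql.snowflake import SnowflakeConfig

    # include_view_lineage without include_table_lineage should be rejected.
    with pytest.raises(pydantic.ValidationError):
        SnowflakeConfig.parse_obj(
            {
                "account_id": "test_account",  # assumed minimal connection field
                "include_table_lineage": False,
                "include_view_lineage": True,
            }
        )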
diff --git a/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json
new file mode 100644
index 00000000000000..a72c960a722969
--- /dev/null
+++ b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json
@@ -0,0 +1,658 @@
+[
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "dataFlowInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "name": "postgres"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "ownership",
+ "aspect": {
+ "json": {
+ "owners": [],
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:fivetran"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "globalTags",
+ "aspect": {
+ "json": {
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "paused": "False",
+ "sync_frequency": "1440",
+ "destination_id": "'interval_unconstitutional'"
+ },
+ "name": "postgres",
+ "type": {
+ "string": "COMMAND"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [
+ "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)",
+ "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)"
+ ],
+ "outputDatasets": [
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)"
+ ],
+ "inputDatajobs": [],
+ "fineGrainedLineages": [
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV),id)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD),id)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV),name)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD),name)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV),id)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD),id)"
+ ],
+ "confidenceScore": 1.0
+ },
+ {
+ "upstreamType": "FIELD_SET",
+ "upstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV),name)"
+ ],
+ "downstreamType": "FIELD",
+ "downstreams": [
+ "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD),name)"
+ ],
+ "confidenceScore": 1.0
+ }
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataset",
+ "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)",
+ "changeType": "UPSERT",
+ "aspectName": "ownership",
+ "aspect": {
+ "json": {
+ "owners": [
+ {
+ "owner": "urn:li:corpuser:Shubham Jagtap",
+ "type": "DEVELOPER",
+ "source": {
+ "type": "SERVICE"
+ }
+ }
+ ],
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:fivetran"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)",
+ "changeType": "UPSERT",
+ "aspectName": "globalTags",
+ "aspect": {
+ "json": {
+ "tags": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "name": "4c9a03d6-eded-4422-a46a-163266e58243",
+ "type": "BATCH_SCHEDULED",
+ "created": {
+ "time": 1695191853000,
+ "actor": "urn:li:corpuser:datahub"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceRelationships",
+ "aspect": {
+ "json": {
+ "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)",
+ "upstreamInstances": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceInput",
+ "aspect": {
+ "json": {
+ "inputs": [
+ "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)",
+ "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceOutput",
+ "aspect": {
+ "json": {
+ "outputs": [
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceRunEvent",
+ "aspect": {
+ "json": {
+ "timestampMillis": 1695191853000,
+ "partitionSpec": {
+ "type": "FULL_TABLE",
+ "partition": "FULL_TABLE_SNAPSHOT"
+ },
+ "status": "STARTED"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceRunEvent",
+ "aspect": {
+ "json": {
+ "timestampMillis": 1695191885000,
+ "partitionSpec": {
+ "type": "FULL_TABLE",
+ "partition": "FULL_TABLE_SNAPSHOT"
+ },
+ "status": "COMPLETE",
+ "result": {
+ "type": "SUCCESS",
+ "nativeResultType": "fivetran"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "name": "f773d1e9-c791-48f4-894f-8cf9b3dfc834",
+ "type": "BATCH_SCHEDULED",
+ "created": {
+ "time": 1696343730000,
+ "actor": "urn:li:corpuser:datahub"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceRelationships",
+ "aspect": {
+ "json": {
+ "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)",
+ "upstreamInstances": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceInput",
+ "aspect": {
+ "json": {
+ "inputs": [
+ "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)",
+ "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceOutput",
+ "aspect": {
+ "json": {
+ "outputs": [
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceRunEvent",
+ "aspect": {
+ "json": {
+ "timestampMillis": 1696343730000,
+ "partitionSpec": {
+ "type": "FULL_TABLE",
+ "partition": "FULL_TABLE_SNAPSHOT"
+ },
+ "status": "STARTED"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceRunEvent",
+ "aspect": {
+ "json": {
+ "timestampMillis": 1696343732000,
+ "partitionSpec": {
+ "type": "FULL_TABLE",
+ "partition": "FULL_TABLE_SNAPSHOT"
+ },
+ "status": "COMPLETE",
+ "result": {
+ "type": "SKIPPED",
+ "nativeResultType": "fivetran"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceProperties",
+ "aspect": {
+ "json": {
+ "customProperties": {},
+ "name": "63c2fc85-600b-455f-9ba0-f576522465be",
+ "type": "BATCH_SCHEDULED",
+ "created": {
+ "time": 1696343755000,
+ "actor": "urn:li:corpuser:datahub"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceRelationships",
+ "aspect": {
+ "json": {
+ "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)",
+ "upstreamInstances": []
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceInput",
+ "aspect": {
+ "json": {
+ "inputs": [
+ "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)",
+ "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceOutput",
+ "aspect": {
+ "json": {
+ "outputs": [
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)"
+ ]
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceRunEvent",
+ "aspect": {
+ "json": {
+ "timestampMillis": 1696343755000,
+ "partitionSpec": {
+ "type": "FULL_TABLE",
+ "partition": "FULL_TABLE_SNAPSHOT"
+ },
+ "status": "STARTED"
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataProcessInstance",
+ "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d",
+ "changeType": "UPSERT",
+ "aspectName": "dataProcessInstanceRunEvent",
+ "aspect": {
+ "json": {
+ "timestampMillis": 1696343790000,
+ "partitionSpec": {
+ "type": "FULL_TABLE",
+ "partition": "FULL_TABLE_SNAPSHOT"
+ },
+ "status": "COMPLETE",
+ "result": {
+ "type": "FAILURE",
+ "nativeResultType": "fivetran"
+ }
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)",
+ "changeType": "UPSERT",
+ "aspectName": "status",
+ "aspect": {
+ "json": {
+ "removed": false
+ }
+ },
+ "systemMetadata": {
+ "lastObserved": 1654621200000,
+ "runId": "powerbi-test",
+ "lastRunId": "no-run-id-provided"
+ }
+}
+]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py
new file mode 100644
index 00000000000000..62b3df12e1b9d3
--- /dev/null
+++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py
@@ -0,0 +1,192 @@
+import datetime
+from unittest import mock
+from unittest.mock import MagicMock
+
+import pytest
+from freezegun import freeze_time
+
+from datahub.ingestion.run.pipeline import Pipeline
+from datahub.ingestion.source.fivetran.config import DestinationConfig
+from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery
+from tests.test_helpers import mce_helpers
+
+FROZEN_TIME = "2022-06-07 17:00:00"
+
+
+def default_query_results(query):
+ if query == FivetranLogQuery.use_schema("TEST_DATABASE", "TEST_SCHEMA"):
+ return []
+ elif query == FivetranLogQuery.get_connectors_query():
+ return [
+ {
+ "connector_id": "calendar_elected",
+ "connecting_user_id": "reapply_phone",
+ "connector_type_id": "postgres",
+ "connector_name": "postgres",
+ "paused": False,
+ "sync_frequency": 1440,
+ "destination_id": "interval_unconstitutional",
+ },
+ ]
+ elif query == FivetranLogQuery.get_table_lineage_query("calendar_elected"):
+ return [
+ {
+ "source_table_id": "10040",
+ "source_table_name": "employee",
+ "source_schema_name": "public",
+ "destination_table_id": "7779",
+ "destination_table_name": "employee",
+ "destination_schema_name": "postgres_public",
+ },
+ {
+ "source_table_id": "10041",
+ "source_table_name": "company",
+ "source_schema_name": "public",
+ "destination_table_id": "7780",
+ "destination_table_name": "company",
+ "destination_schema_name": "postgres_public",
+ },
+ ]
+ elif query == FivetranLogQuery.get_column_lineage_query(
+ "10040", "7779"
+ ) or query == FivetranLogQuery.get_column_lineage_query("10041", "7780"):
+ return [
+ {
+ "source_column_name": "id",
+ "destination_column_name": "id",
+ },
+ {
+ "source_column_name": "name",
+ "destination_column_name": "name",
+ },
+ ]
+ elif query == FivetranLogQuery.get_user_query("reapply_phone"):
+ return [
+ {
+ "user_id": "reapply_phone",
+ "given_name": "Shubham",
+ "family_name": "Jagtap",
+ }
+ ]
+ elif query == FivetranLogQuery.get_sync_start_logs_query("calendar_elected"):
+ return [
+ {
+ "time_stamp": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000),
+ "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243",
+ },
+ {
+ "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000),
+ "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834",
+ },
+ {
+ "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000),
+ "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be",
+ },
+ ]
+ elif query == FivetranLogQuery.get_sync_end_logs_query("calendar_elected"):
+ return [
+ {
+ "time_stamp": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000),
+ "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243",
+ "message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"',
+ },
+ {
+ "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000),
+ "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834",
+ "message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"',
+ },
+ {
+ "time_stamp": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000),
+ "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be",
+ "message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role \\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"',
+ },
+ ]
+    # Fallback: fail loudly on any query not handled above.
+ raise Exception(f"Unknown query {query}")
+
+
+@freeze_time(FROZEN_TIME)
+@pytest.mark.integration
+def test_fivetran_basic(pytestconfig, tmp_path):
+ test_resources_dir = pytestconfig.rootpath / "tests/integration/fivetran"
+
+ # Run the metadata ingestion pipeline.
+ output_file = tmp_path / "fivetran_test_events.json"
+ golden_file = test_resources_dir / "fivetran_golden.json"
+
+ with mock.patch(
+ "datahub.ingestion.source.fivetran.fivetran_log_api.create_engine"
+ ) as mock_create_engine:
+ connection_magic_mock = MagicMock()
+ connection_magic_mock.execute.side_effect = default_query_results
+
+ mock_create_engine.return_value = connection_magic_mock
+
+ pipeline = Pipeline.create(
+ {
+ "run_id": "powerbi-test",
+ "source": {
+ "type": "fivetran",
+ "config": {
+ "fivetran_log_config": {
+ "destination_platform": "snowflake",
+ "destination_config": {
+ "account_id": "TESTID",
+ "warehouse": "TEST_WH",
+ "username": "test",
+ "password": "test@123",
+ "database": "TEST_DATABASE",
+ "role": "TESTROLE",
+ "log_schema": "TEST_SCHEMA",
+ },
+ },
+ "connector_patterns": {
+ "allow": [
+ "postgres",
+ ]
+ },
+ "sources_to_database": {
+ "calendar_elected": "postgres_db",
+ },
+ "sources_to_platform_instance": {
+ "calendar_elected": {
+ "env": "DEV",
+ }
+ },
+ },
+ },
+ "sink": {
+ "type": "file",
+ "config": {
+ "filename": f"{output_file}",
+ },
+ },
+ }
+ )
+
+ pipeline.run()
+ pipeline.raise_from_status()
+ golden_file = "fivetran_golden.json"
+
+ mce_helpers.check_golden_file(
+ pytestconfig,
+ output_path=f"{output_file}",
+ golden_path=f"{test_resources_dir}/{golden_file}",
+ )
+
+
+@freeze_time(FROZEN_TIME)
+def test_fivetran_snowflake_destination_config(pytestconfig, tmp_path):
+ snowflake_dest = DestinationConfig(
+ account_id="TESTID",
+ warehouse="TEST_WH",
+ username="test",
+ password="test@123",
+ database="TEST_DATABASE",
+ role="TESTROLE",
+ log_schema="TEST_SCHEMA",
+ )
+ assert (
+ snowflake_dest.get_sql_alchemy_url()
+ == "snowflake://test:test%40123@TESTID?application=acryl_datahub&authenticator=SNOWFLAKE&role=TESTROLE&warehouse=TEST_WH"
+ )
diff --git a/metadata-service/war/src/main/resources/boot/data_platforms.json b/metadata-service/war/src/main/resources/boot/data_platforms.json
index 3d956c5774dedb..3c70eda8561b86 100644
--- a/metadata-service/war/src/main/resources/boot/data_platforms.json
+++ b/metadata-service/war/src/main/resources/boot/data_platforms.json
@@ -564,5 +564,15 @@
"type": "KEY_VALUE_STORE",
"logoUrl": "/assets/platforms/dynamodblogo.png"
}
+ },
+ {
+ "urn": "urn:li:dataPlatform:fivetran",
+ "aspect": {
+ "datasetNameDelimiter": ".",
+ "name": "fivetran",
+ "displayName": "Fivetran",
+ "type": "OTHERS",
+ "logoUrl": "/assets/platforms/fivetranlogo.png"
+ }
}
]
From 399e032dfa2b4bf87b7b406e7b009e34e99a1003 Mon Sep 17 00:00:00 2001
From: deepgarg-visa <149145061+deepgarg-visa@users.noreply.github.com>
Date: Wed, 8 Nov 2023 22:32:13 +0530
Subject: [PATCH 25/33] feat(neo4j): Allow datahub to connect to specific neo4j
database (#9179)
Co-authored-by: david-leifker <114954101+david-leifker@users.noreply.github.com>
---
docker/docker-compose.override.yml | 4 ++
.../docker-compose-m1.quickstart.yml | 54 +++++++++----------
.../quickstart/docker-compose.quickstart.yml | 54 +++++++++----------
.../src/main/resources/application.yml | 1 +
.../common/Neo4jGraphServiceFactory.java | 7 ++-
5 files changed, 65 insertions(+), 55 deletions(-)
diff --git a/docker/docker-compose.override.yml b/docker/docker-compose.override.yml
index 225aa01fa4e4f8..0907f47d70c3ce 100644
--- a/docker/docker-compose.override.yml
+++ b/docker/docker-compose.override.yml
@@ -7,8 +7,12 @@ services:
environment:
- DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart}
- DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true}
+ - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch}
volumes:
- ${HOME}/.datahub/plugins:/etc/datahub/plugins
+ datahub-upgrade:
+ environment:
+ - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch}
mysql-setup:
container_name: mysql-setup
hostname: mysql-setup
diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml
index c96baf37551b29..613718306abef6 100644
--- a/docker/quickstart/docker-compose-m1.quickstart.yml
+++ b/docker/quickstart/docker-compose-m1.quickstart.yml
@@ -81,32 +81,32 @@ services:
- DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart}
- DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true}
- DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-gms
- - EBEAN_DATASOURCE_USERNAME=datahub
- - EBEAN_DATASOURCE_PASSWORD=datahub
+ - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- EBEAN_DATASOURCE_HOST=mysql:3306
+ - EBEAN_DATASOURCE_PASSWORD=datahub
- EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8&enabledTLSProtocols=TLSv1.2
- - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- - KAFKA_BOOTSTRAP_SERVER=broker:29092
- - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
+ - EBEAN_DATASOURCE_USERNAME=datahub
- ELASTICSEARCH_HOST=elasticsearch
- - ELASTICSEARCH_PORT=9200
- - ES_BULK_REFRESH_POLICY=WAIT_UNTIL
- - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
- - NEO4J_HOST=http://neo4j:7474
- - NEO4J_URI=bolt://neo4j
- - NEO4J_USERNAME=neo4j
- - NEO4J_PASSWORD=datahub
- - JAVA_OPTS=-Xms1g -Xmx1g
- - GRAPH_SERVICE_DIFF_MODE_ENABLED=true
- - GRAPH_SERVICE_IMPL=neo4j
+ - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
+ - ELASTICSEARCH_PORT=9200
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- ENTITY_SERVICE_ENABLE_RETENTION=true
+ - ES_BULK_REFRESH_POLICY=WAIT_UNTIL
+ - GRAPH_SERVICE_DIFF_MODE_ENABLED=true
+ - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch}
+ - JAVA_OPTS=-Xms1g -Xmx1g
+ - KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- MAE_CONSUMER_ENABLED=true
- MCE_CONSUMER_ENABLED=true
+ - METADATA_SERVICE_AUTH_ENABLED=false
+ - NEO4J_HOST=http://neo4j:7474
+ - NEO4J_PASSWORD=datahub
+ - NEO4J_URI=bolt://neo4j
+ - NEO4J_USERNAME=neo4j
- PE_CONSUMER_ENABLED=true
- UI_INGESTION_ENABLED=true
- - METADATA_SERVICE_AUTH_ENABLED=false
healthcheck:
interval: 1s
retries: 3
@@ -134,23 +134,23 @@ services:
neo4j:
condition: service_healthy
environment:
- - EBEAN_DATASOURCE_USERNAME=datahub
- - EBEAN_DATASOURCE_PASSWORD=datahub
+ - BACKFILL_BROWSE_PATHS_V2=true
+ - DATAHUB_GMS_HOST=datahub-gms
+ - DATAHUB_GMS_PORT=8080
+ - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- EBEAN_DATASOURCE_HOST=mysql:3306
+ - EBEAN_DATASOURCE_PASSWORD=datahub
- EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
- - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- - KAFKA_BOOTSTRAP_SERVER=broker:29092
- - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
+ - EBEAN_DATASOURCE_USERNAME=datahub
+ - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false
- ELASTICSEARCH_HOST=elasticsearch
- - ELASTICSEARCH_PORT=9200
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
- ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
- - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false
- - GRAPH_SERVICE_IMPL=elasticsearch
- - DATAHUB_GMS_HOST=datahub-gms
- - DATAHUB_GMS_PORT=8080
+ - ELASTICSEARCH_PORT=9200
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- - BACKFILL_BROWSE_PATHS_V2=true
+ - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch}
+ - KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- REPROCESS_DEFAULT_BROWSE_PATHS_V2=false
hostname: datahub-upgrade
image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head}
diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml
index 8a66521cbb5221..30ccbae59be740 100644
--- a/docker/quickstart/docker-compose.quickstart.yml
+++ b/docker/quickstart/docker-compose.quickstart.yml
@@ -81,32 +81,32 @@ services:
- DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart}
- DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true}
- DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-gms
- - EBEAN_DATASOURCE_USERNAME=datahub
- - EBEAN_DATASOURCE_PASSWORD=datahub
+ - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- EBEAN_DATASOURCE_HOST=mysql:3306
+ - EBEAN_DATASOURCE_PASSWORD=datahub
- EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8&enabledTLSProtocols=TLSv1.2
- - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- - KAFKA_BOOTSTRAP_SERVER=broker:29092
- - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
+ - EBEAN_DATASOURCE_USERNAME=datahub
- ELASTICSEARCH_HOST=elasticsearch
- - ELASTICSEARCH_PORT=9200
- - ES_BULK_REFRESH_POLICY=WAIT_UNTIL
- - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
- - NEO4J_HOST=http://neo4j:7474
- - NEO4J_URI=bolt://neo4j
- - NEO4J_USERNAME=neo4j
- - NEO4J_PASSWORD=datahub
- - JAVA_OPTS=-Xms1g -Xmx1g
- - GRAPH_SERVICE_DIFF_MODE_ENABLED=true
- - GRAPH_SERVICE_IMPL=neo4j
+ - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
+ - ELASTICSEARCH_PORT=9200
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- ENTITY_SERVICE_ENABLE_RETENTION=true
+ - ES_BULK_REFRESH_POLICY=WAIT_UNTIL
+ - GRAPH_SERVICE_DIFF_MODE_ENABLED=true
+ - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch}
+ - JAVA_OPTS=-Xms1g -Xmx1g
+ - KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- MAE_CONSUMER_ENABLED=true
- MCE_CONSUMER_ENABLED=true
+ - METADATA_SERVICE_AUTH_ENABLED=false
+ - NEO4J_HOST=http://neo4j:7474
+ - NEO4J_PASSWORD=datahub
+ - NEO4J_URI=bolt://neo4j
+ - NEO4J_USERNAME=neo4j
- PE_CONSUMER_ENABLED=true
- UI_INGESTION_ENABLED=true
- - METADATA_SERVICE_AUTH_ENABLED=false
healthcheck:
interval: 1s
retries: 3
@@ -134,23 +134,23 @@ services:
neo4j:
condition: service_healthy
environment:
- - EBEAN_DATASOURCE_USERNAME=datahub
- - EBEAN_DATASOURCE_PASSWORD=datahub
+ - BACKFILL_BROWSE_PATHS_V2=true
+ - DATAHUB_GMS_HOST=datahub-gms
+ - DATAHUB_GMS_PORT=8080
+ - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- EBEAN_DATASOURCE_HOST=mysql:3306
+ - EBEAN_DATASOURCE_PASSWORD=datahub
- EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
- - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
- - KAFKA_BOOTSTRAP_SERVER=broker:29092
- - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
+ - EBEAN_DATASOURCE_USERNAME=datahub
+ - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false
- ELASTICSEARCH_HOST=elasticsearch
- - ELASTICSEARCH_PORT=9200
- ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
- ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
- - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false
- - GRAPH_SERVICE_IMPL=elasticsearch
- - DATAHUB_GMS_HOST=datahub-gms
- - DATAHUB_GMS_PORT=8080
+ - ELASTICSEARCH_PORT=9200
- ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml
- - BACKFILL_BROWSE_PATHS_V2=true
+ - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch}
+ - KAFKA_BOOTSTRAP_SERVER=broker:29092
+ - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
- REPROCESS_DEFAULT_BROWSE_PATHS_V2=false
hostname: datahub-upgrade
image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head}
diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml
index a06891699607bb..46aa02d98572e6 100644
--- a/metadata-service/configuration/src/main/resources/application.yml
+++ b/metadata-service/configuration/src/main/resources/application.yml
@@ -251,6 +251,7 @@ neo4j:
username: ${NEO4J_USERNAME:neo4j}
password: ${NEO4J_PASSWORD:datahub}
uri: ${NEO4J_URI:bolt://localhost}
+ database: ${NEO4J_DATABASE:graph.db}
maxConnectionPoolSize: ${NEO4J_MAX_CONNECTION_POOL_SIZE:100}
maxConnectionAcquisitionTimeout: ${NEO4J_MAX_CONNECTION_ACQUISITION_TIMEOUT_IN_SECONDS:60}
maxConnectionLifetimeInSeconds: ${NEO4j_MAX_CONNECTION_LIFETIME_IN_SECONDS:3600}
diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java
index e62dfd50f897d7..87670ce10f481a 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java
@@ -6,8 +6,10 @@
import com.linkedin.metadata.models.registry.EntityRegistry;
import javax.annotation.Nonnull;
import org.neo4j.driver.Driver;
+import org.neo4j.driver.SessionConfig;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
import org.springframework.context.annotation.Configuration;
import org.springframework.context.annotation.Import;
@@ -24,10 +26,13 @@ public class Neo4jGraphServiceFactory {
@Qualifier("entityRegistry")
private EntityRegistry entityRegistry;
+ @Value("${neo4j.database}")
+ private String neo4jDatabase;
+
@Bean(name = "neo4jGraphService")
@Nonnull
protected Neo4jGraphService getInstance() {
LineageRegistry lineageRegistry = new LineageRegistry(entityRegistry);
- return new Neo4jGraphService(lineageRegistry, neo4jDriver);
+ return new Neo4jGraphService(lineageRegistry, neo4jDriver, SessionConfig.forDatabase(neo4jDatabase));
}
}
From 332d4afaab39e4b9e9ff73a48e3bfec9b21fe0b5 Mon Sep 17 00:00:00 2001
From: Gabe Lyons
Date: Wed, 8 Nov 2023 10:22:09 -0800
Subject: [PATCH 26/33] feat(subtypes): support subtypes for charts in the UI
(#9186)
---
.../java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java | 4 ++++
datahub-graphql-core/src/main/resources/entity.graphql | 5 +++++
datahub-web-react/src/app/entity/chart/ChartEntity.tsx | 4 ++++
.../src/app/entity/chart/preview/ChartPreview.tsx | 5 ++++-
datahub-web-react/src/graphql/chart.graphql | 3 +++
datahub-web-react/src/graphql/lineage.graphql | 3 +++
datahub-web-react/src/graphql/search.graphql | 6 ++++++
metadata-models/src/main/resources/entity-registry.yml | 1 +
8 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
index b99f712034fe03..b0b26f073876c4 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
@@ -1433,6 +1433,10 @@ private void configureChartResolvers(final RuntimeWiring.Builder builder) {
.dataFetcher("statsSummary", new ChartStatsSummaryResolver(this.timeseriesAspectService))
.dataFetcher("privileges", new EntityPrivilegesResolver(entityClient))
.dataFetcher("exists", new EntityExistsResolver(entityService))
+ .dataFetcher("subTypes", new SubTypesResolver(
+ this.entityClient,
+ "chart",
+ "subTypes"))
);
builder.type("ChartInfo", typeWiring -> typeWiring
.dataFetcher("inputs", new LoadableTypeBatchResolver<>(datasetType,
diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql
index b37a8f34fa0563..035f756a10d557 100644
--- a/datahub-graphql-core/src/main/resources/entity.graphql
+++ b/datahub-graphql-core/src/main/resources/entity.graphql
@@ -5249,6 +5249,11 @@ type Chart implements EntityWithRelationships & Entity & BrowsableEntity {
Whether or not this entity exists on DataHub
"""
exists: Boolean
+
+ """
+ Sub Types that this entity implements
+ """
+ subTypes: SubTypes
}
"""
diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx
index 0f1b6dbf3d660d..fc898dec9d93af 100644
--- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx
+++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx
@@ -154,10 +154,12 @@ export class ChartEntity implements Entity {
getOverridePropertiesFromEntity = (chart?: Chart | null): GenericEntityProperties => {
// TODO: Get rid of this once we have correctly formed platform coming back.
const name = chart?.properties?.name;
+ const subTypes = chart?.subTypes;
const externalUrl = chart?.properties?.externalUrl;
return {
name,
externalUrl,
+ entityTypeOverride: subTypes ? capitalizeFirstLetterOnly(subTypes.typeNames?.[0]) : '',
};
};
@@ -187,6 +189,7 @@ export class ChartEntity implements Entity {
return (
{
type: EntityType.Chart,
icon: entity?.platform?.properties?.logoUrl || undefined,
platform: entity?.platform,
+ subtype: entity?.subTypes?.typeNames?.[0] || undefined,
};
};
diff --git a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx
index 7d0fc143043e29..b7fbd63ee231e3 100644
--- a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx
+++ b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx
@@ -15,6 +15,7 @@ import {
EntityPath,
} from '../../../../types.generated';
import DefaultPreviewCard from '../../../preview/DefaultPreviewCard';
+import { capitalizeFirstLetterOnly } from '../../../shared/textUtil';
import { useEntityRegistry } from '../../../useEntityRegistry';
import { IconStyleType } from '../../Entity';
import { ChartStatsSummary as ChartStatsSummaryView } from '../shared/ChartStatsSummary';
@@ -43,6 +44,7 @@ export const ChartPreview = ({
snippet,
degree,
paths,
+ subType,
}: {
urn: string;
platform?: string;
@@ -67,6 +69,7 @@ export const ChartPreview = ({
snippet?: React.ReactNode | null;
degree?: number;
paths?: EntityPath[];
+ subType?: string | null;
}): JSX.Element => {
const entityRegistry = useEntityRegistry();
@@ -76,7 +79,7 @@ export const ChartPreview = ({
name={name || ''}
urn={urn}
description={description || ''}
- type="Chart"
+ type={capitalizeFirstLetterOnly(subType) || 'Chart'}
typeIcon={entityRegistry.getIcon(EntityType.Chart, 14, IconStyleType.ACCENT)}
logoUrl={logoUrl || ''}
platform={platform}
diff --git a/datahub-web-react/src/graphql/chart.graphql b/datahub-web-react/src/graphql/chart.graphql
index d4d3c3c9184082..a4b430720fa3d5 100644
--- a/datahub-web-react/src/graphql/chart.graphql
+++ b/datahub-web-react/src/graphql/chart.graphql
@@ -100,6 +100,9 @@ query getChart($urn: String!) {
canEditLineage
canEditEmbed
}
+ subTypes {
+ typeNames
+ }
}
}
diff --git a/datahub-web-react/src/graphql/lineage.graphql b/datahub-web-react/src/graphql/lineage.graphql
index 52385dee8631ac..8fdfb696e08943 100644
--- a/datahub-web-react/src/graphql/lineage.graphql
+++ b/datahub-web-react/src/graphql/lineage.graphql
@@ -165,6 +165,9 @@ fragment lineageNodeProperties on EntityWithRelationships {
status {
removed
}
+ subTypes {
+ typeNames
+ }
}
... on Dataset {
name
diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql
index 2297c2d0c1d075..876be12fd335b7 100644
--- a/datahub-web-react/src/graphql/search.graphql
+++ b/datahub-web-react/src/graphql/search.graphql
@@ -105,6 +105,9 @@ fragment autoCompleteFields on Entity {
parentContainers {
...parentContainersFields
}
+ subTypes {
+ typeNames
+ }
}
... on DataFlow {
orchestrator
@@ -550,6 +553,9 @@ fragment searchResultFields on Entity {
}
}
}
+ subTypes {
+ typeNames
+ }
}
... on DataFlow {
flowId
diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml
index 11d0f74305d7be..a5296d074093be 100644
--- a/metadata-models/src/main/resources/entity-registry.yml
+++ b/metadata-models/src/main/resources/entity-registry.yml
@@ -120,6 +120,7 @@ entities:
- globalTags
- glossaryTerms
- browsePathsV2
+ - subTypes
- name: dashboard
keyAspect: dashboardKey
aspects:
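
The net effect of the patch above is that charts carry a subTypes aspect end to end: the GraphQL API exposes it, the web client requests it in the chart, lineage, and search queries, and previews display the first subtype name instead of the generic "Chart" label. A minimal TypeScript sketch of that display fallback follows (not the DataHub source; capitalizeFirstLetterOnly is re-implemented here as an assumption about the shared textUtil helper):

interface SubTypes {
    typeNames?: string[] | null;
}

// Assumed behaviour of the shared helper: uppercase only the first character.
function capitalizeFirstLetterOnly(value?: string | null): string | undefined {
    return value ? value.charAt(0).toUpperCase() + value.slice(1) : undefined;
}

// Prefer the first declared subtype for display, otherwise fall back to "Chart".
function displayTypeForChart(subTypes?: SubTypes | null): string {
    return capitalizeFirstLetterOnly(subTypes?.typeNames?.[0]) || 'Chart';
}

// displayTypeForChart({ typeNames: ['looker view' ] }) -> 'Looker view'
// displayTypeForChart(null) -> 'Chart'
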
From 72135914109a241aa11ceaeb68b9ac56134e7e64 Mon Sep 17 00:00:00 2001
From: Chris Collins
Date: Wed, 8 Nov 2023 14:36:33 -0500
Subject: [PATCH 27/33] feat(ui) Debounce auto-complete in search bar (#9205)
---
datahub-web-react/src/app/home/HomePageHeader.tsx | 6 ++++--
datahub-web-react/src/app/search/SearchablePage.tsx | 6 ++++--
2 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/datahub-web-react/src/app/home/HomePageHeader.tsx b/datahub-web-react/src/app/home/HomePageHeader.tsx
index 0052d54f562ebd..c881109f6e419d 100644
--- a/datahub-web-react/src/app/home/HomePageHeader.tsx
+++ b/datahub-web-react/src/app/home/HomePageHeader.tsx
@@ -1,6 +1,7 @@
import React, { useEffect, useMemo, useState } from 'react';
import { useHistory } from 'react-router';
import { Typography, Image, Row, Button, Tag } from 'antd';
+import { debounce } from 'lodash';
import styled, { useTheme } from 'styled-components/macro';
import { RightOutlined } from '@ant-design/icons';
import { ManageAccount } from '../shared/ManageAccount';
@@ -24,6 +25,7 @@ import { getAutoCompleteInputFromQuickFilter } from '../search/utils/filterUtils
import { useUserContext } from '../context/useUserContext';
import AcrylDemoBanner from './AcrylDemoBanner';
import DemoButton from '../entity/shared/components/styled/DemoButton';
+import { HALF_SECOND_IN_MS } from '../entity/shared/tabs/Dataset/Queries/utils/constants';
const Background = styled.div`
width: 100%;
@@ -176,7 +178,7 @@ export const HomePageHeader = () => {
});
};
- const onAutoComplete = (query: string) => {
+ const onAutoComplete = debounce((query: string) => {
if (query && query.trim() !== '') {
getAutoCompleteResultsForMultiple({
variables: {
@@ -189,7 +191,7 @@ export const HomePageHeader = () => {
},
});
}
- };
+ }, HALF_SECOND_IN_MS);
const onClickExploreAll = () => {
analytics.event({
diff --git a/datahub-web-react/src/app/search/SearchablePage.tsx b/datahub-web-react/src/app/search/SearchablePage.tsx
index 489687050c749d..9d02d85d3634c0 100644
--- a/datahub-web-react/src/app/search/SearchablePage.tsx
+++ b/datahub-web-react/src/app/search/SearchablePage.tsx
@@ -1,5 +1,6 @@
import React, { useEffect, useState } from 'react';
import { useHistory, useLocation } from 'react-router';
+import { debounce } from 'lodash';
import * as QueryString from 'query-string';
import { useTheme } from 'styled-components';
import { SearchHeader } from './SearchHeader';
@@ -17,6 +18,7 @@ import { getAutoCompleteInputFromQuickFilter } from './utils/filterUtils';
import { useQuickFiltersContext } from '../../providers/QuickFiltersContext';
import { useUserContext } from '../context/useUserContext';
import { useSelectedSortOption } from './context/SearchContext';
+import { HALF_SECOND_IN_MS } from '../entity/shared/tabs/Dataset/Queries/utils/constants';
const styles = {
children: {
@@ -93,7 +95,7 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) =>
});
};
- const autoComplete = (query: string) => {
+ const autoComplete = debounce((query: string) => {
if (query && query.trim() !== '') {
getAutoCompleteResults({
variables: {
@@ -105,7 +107,7 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) =>
},
});
}
- };
+ }, HALF_SECOND_IN_MS);
// Load correct autocomplete results on initial page load.
useEffect(() => {
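
Both call sites above follow the same pattern: wrap the auto-complete handler in lodash's debounce so the GraphQL query fires only after the user pauses typing for half a second, rather than on every keystroke. A standalone TypeScript sketch of the pattern (fetchAutoComplete and the 500 ms constant are stand-ins, not the real hook or shared constant):

import { debounce } from 'lodash';

const HALF_SECOND_IN_MS = 500; // stand-in for the shared constant

// Stand-in for the GraphQL auto-complete call made by the real components.
function fetchAutoComplete(query: string): void {
    console.log(`fetching suggestions for "${query}"`);
}

// The debounced wrapper is created once; rapid calls while the user types
// collapse into a single invocation 500 ms after the last keystroke.
const onAutoComplete = debounce((query: string) => {
    if (query && query.trim() !== '') {
        fetchAutoComplete(query.trim());
    }
}, HALF_SECOND_IN_MS);

One caveat worth noting: when the debounced function is created inside a React component body, a re-render produces a fresh instance; memoizing it (for example with useMemo) is a common follow-up if renders happen while the user is typing.
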
From 70692b44e995eab252a2344496141acdf6181908 Mon Sep 17 00:00:00 2001
From: Gabe Lyons
Date: Wed, 8 Nov 2023 12:49:23 -0800
Subject: [PATCH 28/33] fix(lineage): magical lineage layout fix (#9187)
---
.../src/app/lineage/utils/layoutTree.ts | 21 +++++++++++++++----
1 file changed, 17 insertions(+), 4 deletions(-)
diff --git a/datahub-web-react/src/app/lineage/utils/layoutTree.ts b/datahub-web-react/src/app/lineage/utils/layoutTree.ts
index cc704007049c20..a972a62308f073 100644
--- a/datahub-web-react/src/app/lineage/utils/layoutTree.ts
+++ b/datahub-web-react/src/app/lineage/utils/layoutTree.ts
@@ -32,6 +32,21 @@ function getParentRelationship(direction: Direction, parent: VizNode | null, nod
return directionRelationships?.find((r) => r?.entity?.urn === node?.urn);
}
+// this utility function is to help make sure layouts that contain many references to the same URN don't struggle laying out that URN.
+function firstAppearanceIndices(arr) {
+ const seen = new Set(); // To track which strings have been seen
+ const result = [] as number[];
+
+ for (let i = 0; i < arr.length; i++) {
+ if (!seen.has(arr[i])) {
+ seen.add(arr[i]); // Add the string to the set
+ result.push(i); // Save the index
+ }
+ }
+
+ return result;
+}
+
function layoutNodesForOneDirection(
data: NodeData,
direction: Direction,
@@ -54,12 +69,10 @@ function layoutNodesForOneDirection(
while (nodesInCurrentLayer.length > 0) {
// if we've already added a node to the viz higher up dont add it again
const urnsToAddInCurrentLayer = Array.from(new Set(nodesInCurrentLayer.map(({ node }) => node.urn || '')));
- const nodesToAddInCurrentLayer = urnsToAddInCurrentLayer
- .filter((urn, pos) => urnsToAddInCurrentLayer.indexOf(urn) === pos)
- .filter((urn) => !nodesByUrn[urn || '']);
+ const positionsToAddInCurrentLayer = firstAppearanceIndices(urnsToAddInCurrentLayer);
const filteredNodesInCurrentLayer = nodesInCurrentLayer
- .filter(({ node }) => nodesToAddInCurrentLayer.indexOf(node.urn || '') > -1)
+ .filter((_, idx) => positionsToAddInCurrentLayer.indexOf(idx) > -1)
.filter(({ node }) => node.status?.removed !== true);
const layerSize = filteredNodesInCurrentLayer.length;
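
The helper added above is the core of the fix: when the same URN appears several times in a layer, only its first occurrence should be laid out. A typed, self-contained TypeScript restatement of that helper (the generic signature is added here for clarity and is not the committed code):

function firstAppearanceIndices<T>(arr: T[]): number[] {
    const seen = new Set<T>();     // values already encountered
    const result: number[] = [];   // indices of first occurrences
    arr.forEach((value, index) => {
        if (!seen.has(value)) {
            seen.add(value);
            result.push(index);
        }
    });
    return result;
}

// firstAppearanceIndices(['a', 'b', 'a', 'c', 'b']) -> [0, 1, 3]
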
From f87983d69dc62db5c58dc114f8796dcb9eb1cc95 Mon Sep 17 00:00:00 2001
From: John Joyce
Date: Wed, 8 Nov 2023 13:29:37 -0800
Subject: [PATCH 29/33] refactor(pdl): Refactoring Assertion model enums out
(#9191)
Co-authored-by: Harshal Sheth
---
.../linkedin/assertion/AssertionResult.pdl | 19 +--------------
.../assertion/AssertionResultType.pdl | 23 +++++++++++++++++++
.../linkedin/assertion/AssertionRunEvent.pdl | 7 +-----
.../linkedin/assertion/AssertionRunStatus.pdl | 12 ++++++++++
4 files changed, 37 insertions(+), 24 deletions(-)
create mode 100644 metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl
create mode 100644 metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl
index ded84e1969153b..935f3e5976dfa5 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl
@@ -9,24 +9,7 @@ record AssertionResult {
*/
@TimeseriesField = {}
@Searchable = {}
- type: enum AssertionResultType {
- /**
- * The Assertion has not yet been fully evaluated
- */
- INIT
- /**
- * The Assertion Succeeded
- */
- SUCCESS
- /**
- * The Assertion Failed
- */
- FAILURE
- /**
- * The Assertion encountered an Error
- */
- ERROR
- }
+ type: AssertionResultType
/**
* Number of rows for evaluated batch
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl
new file mode 100644
index 00000000000000..8954d94cced7bf
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl
@@ -0,0 +1,23 @@
+namespace com.linkedin.assertion
+
+/**
+* The final result of evaluating an assertion, e.g. SUCCESS, FAILURE, or ERROR.
+*/
+enum AssertionResultType {
+ /**
+ * The Assertion has not yet been fully evaluated
+ */
+ INIT
+ /**
+ * The Assertion Succeeded
+ */
+ SUCCESS
+ /**
+ * The Assertion Failed
+ */
+ FAILURE
+ /**
+ * The Assertion encountered an Error
+ */
+ ERROR
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl
index 14f12042327404..55bcae77273dbd 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl
@@ -39,12 +39,7 @@ record AssertionRunEvent {
* The status of the assertion run as per this timeseries event.
*/
@TimeseriesField = {}
- status: enum AssertionRunStatus {
- /**
- * The Assertion Run has completed
- */
- COMPLETE
- }
+ status: AssertionRunStatus
/**
* Results of assertion, present if the status is COMPLETE
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl
new file mode 100644
index 00000000000000..e4e17925ede82a
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl
@@ -0,0 +1,12 @@
+namespace com.linkedin.assertion
+
+
+/**
+* The lifecycle status of an assertion run.
+*/
+enum AssertionRunStatus {
+ /**
+ * The Assertion Run has completed
+ */
+ COMPLETE
+}
\ No newline at end of file
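
The refactor above only moves definitions: the enums previously declared inline inside AssertionResult and AssertionRunEvent become standalone PDL types, so other models can reference the same result and run-status vocabularies. A rough TypeScript analogue of the resulting shape (illustrative stand-ins, not the generated classes):

enum AssertionResultType { INIT = 'INIT', SUCCESS = 'SUCCESS', FAILURE = 'FAILURE', ERROR = 'ERROR' }
enum AssertionRunStatus { COMPLETE = 'COMPLETE' }

interface AssertionResult {
    type: AssertionResultType;   // previously an inline enum on the field itself
}

interface AssertionRunEvent {
    status: AssertionRunStatus;  // previously an inline enum on the field itself
    result?: AssertionResult;    // present when status is COMPLETE
}
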
From f38c8087bb508a779d94d04967a9c449f6d93126 Mon Sep 17 00:00:00 2001
From: Pedro Silva
Date: Wed, 8 Nov 2023 22:38:15 +0000
Subject: [PATCH 30/33] feat(auth): Add roles to policy engine validation logic
(#9178)
---
.../authorization/AuthorizedActors.java | 1 +
.../authorization/AuthorizerChain.java | 5 +
.../authorization/DataHubAuthorizer.java | 8 +-
.../datahub/authorization/PolicyEngine.java | 43 +++-----
.../authorization/DataHubAuthorizerTest.java | 97 ++++++++++++++++---
.../authorization/PolicyEngineTest.java | 54 ++++++++++-
.../datahub/plugins/test/TestAuthorizer.java | 2 +-
7 files changed, 162 insertions(+), 48 deletions(-)
diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java
index aec99e1b1e57a8..5a9990552bb34a 100644
--- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java
+++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java
@@ -15,6 +15,7 @@ public class AuthorizedActors {
String privilege;
List<Urn> users;
List<Urn> groups;
+ List<Urn> roles;
boolean allUsers;
boolean allGroups;
}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java
index f8eca541e1efb4..7e7a1de176f06d 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java
@@ -126,11 +126,16 @@ private AuthorizedActors mergeAuthorizedActors(@Nullable AuthorizedActors origin
mergedGroups = new ArrayList<>(groups);
}
+ Set<Urn> roles = new HashSet<>(original.getRoles());
+ roles.addAll(other.getRoles());
+ List<Urn> mergedRoles = new ArrayList<>(roles);
+
return AuthorizedActors.builder()
.allUsers(original.isAllUsers() || other.isAllUsers())
.allGroups(original.isAllGroups() || other.isAllGroups())
.users(mergedUsers)
.groups(mergedGroups)
+ .roles(mergedRoles)
.build();
}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
index f8f99475de23e2..956d635c7901ac 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
@@ -133,6 +133,7 @@ public AuthorizedActors authorizedActors(
final List<Urn> authorizedUsers = new ArrayList<>();
final List<Urn> authorizedGroups = new ArrayList<>();
+ final List<Urn> authorizedRoles = new ArrayList<>();
boolean allUsers = false;
boolean allGroups = false;
@@ -153,16 +154,17 @@ public AuthorizedActors authorizedActors(
// Step 3: For each matching policy, add actors that are authorized.
authorizedUsers.addAll(matchingActors.getUsers());
authorizedGroups.addAll(matchingActors.getGroups());
- if (matchingActors.allUsers()) {
+ authorizedRoles.addAll(matchingActors.getRoles());
+ if (matchingActors.getAllUsers()) {
allUsers = true;
}
- if (matchingActors.allGroups()) {
+ if (matchingActors.getAllGroups()) {
allGroups = true;
}
}
// Step 4: Return all authorized users and groups.
- return new AuthorizedActors(privilege, authorizedUsers, authorizedGroups, allUsers, allGroups);
+ return new AuthorizedActors(privilege, authorizedUsers, authorizedGroups, authorizedRoles, allUsers, allGroups);
}
/**
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java
index f8c017ea74e1f6..da0ae26f2b1da6 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java
@@ -32,7 +32,10 @@
import java.util.stream.Stream;
import javax.annotation.Nullable;
+import lombok.AccessLevel;
+import lombok.AllArgsConstructor;
import lombok.RequiredArgsConstructor;
+import lombok.Value;
import lombok.extern.slf4j.Slf4j;
import static com.linkedin.metadata.Constants.*;
@@ -75,6 +78,7 @@ public PolicyActors getMatchingActors(
final Optional<ResolvedEntitySpec> resource) {
final List<Urn> users = new ArrayList<>();
final List<Urn> groups = new ArrayList<>();
+ final List<Urn> roles = new ArrayList<>();
boolean allUsers = false;
boolean allGroups = false;
if (policyMatchesResource(policy, resource)) {
@@ -96,6 +100,9 @@ public PolicyActors getMatchingActors(
if (actorFilter.getGroups() != null) {
groups.addAll(actorFilter.getGroups());
}
+ if (actorFilter.getRoles() != null) {
+ roles.addAll(actorFilter.getRoles());
+ }
// 2. Fetch Actors based on resource ownership.
if (actorFilter.isResourceOwners() && resource.isPresent()) {
@@ -104,7 +111,7 @@ public PolicyActors getMatchingActors(
groups.addAll(groupOwners(owners));
}
}
- return new PolicyActors(users, groups, allUsers, allGroups);
+ return new PolicyActors(users, groups, roles, allUsers, allGroups);
}
private boolean isPolicyApplicable(
@@ -438,34 +445,14 @@ public boolean isGranted() {
/**
* Class used to represent all valid users of a policy.
*/
+ @Value
+ @AllArgsConstructor(access = AccessLevel.PUBLIC)
public static class PolicyActors {
- final List<Urn> _users;
- final List<Urn> _groups;
- final Boolean _allUsers;
- final Boolean _allGroups;
-
- public PolicyActors(final List<Urn> users, final List<Urn> groups, final Boolean allUsers, final Boolean allGroups) {
- _users = users;
- _groups = groups;
- _allUsers = allUsers;
- _allGroups = allGroups;
- }
-
- public List<Urn> getUsers() {
- return _users;
- }
-
- public List<Urn> getGroups() {
- return _groups;
- }
-
- public Boolean allUsers() {
- return _allUsers;
- }
-
- public Boolean allGroups() {
- return _allGroups;
- }
+ List<Urn> users;
+ List<Urn> groups;
+ List<Urn> roles;
+ Boolean allUsers;
+ Boolean allGroups;
}
private List userOwners(final Set owners) {
diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java
index babb1c5d00ee8a..b0b206001209c7 100644
--- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java
+++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java
@@ -21,6 +21,7 @@
import com.linkedin.entity.EnvelopedAspect;
import com.linkedin.entity.EnvelopedAspectMap;
import com.linkedin.entity.client.EntityClient;
+import com.linkedin.identity.RoleMembership;
import com.linkedin.metadata.query.SearchFlags;
import com.linkedin.metadata.search.ScrollResult;
import com.linkedin.metadata.search.SearchEntity;
@@ -55,6 +56,7 @@
import static org.mockito.Mockito.when;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;
+import static org.testng.Assert.assertFalse;
public class DataHubAuthorizerTest {
@@ -63,6 +65,7 @@ public class DataHubAuthorizerTest {
private static final Urn PARENT_DOMAIN_URN = UrnUtils.getUrn("urn:li:domain:parent");
private static final Urn CHILD_DOMAIN_URN = UrnUtils.getUrn("urn:li:domain:child");
+ private static final Urn USER_WITH_ADMIN_ROLE = UrnUtils.getUrn("urn:li:corpuser:user-with-admin");
private EntityClient _entityClient;
private DataHubAuthorizer _dataHubAuthorizer;
@@ -92,40 +95,56 @@ public void setupTest() throws Exception {
final EnvelopedAspectMap childDomainPolicyAspectMap = new EnvelopedAspectMap();
childDomainPolicyAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(childDomainPolicy.data())));
+ final Urn adminPolicyUrn = Urn.createFromString("urn:li:dataHubPolicy:4");
+ final DataHubActorFilter actorFilter = new DataHubActorFilter();
+ actorFilter.setRoles(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:dataHubRole:Admin"))));
+ final DataHubPolicyInfo adminPolicy = createDataHubPolicyInfoFor(true, ImmutableList.of("EDIT_USER_PROFILE"), null, actorFilter);
+ final EnvelopedAspectMap adminPolicyAspectMap = new EnvelopedAspectMap();
+ adminPolicyAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(adminPolicy.data())));
+
final ScrollResult policySearchResult1 = new ScrollResult()
.setScrollId("1")
- .setNumEntities(4)
+ .setNumEntities(5)
.setEntities(
new SearchEntityArray(
ImmutableList.of(new SearchEntity().setEntity(activePolicyUrn))));
final ScrollResult policySearchResult2 = new ScrollResult()
.setScrollId("2")
- .setNumEntities(4)
+ .setNumEntities(5)
.setEntities(
new SearchEntityArray(
ImmutableList.of(new SearchEntity().setEntity(inactivePolicyUrn))));
final ScrollResult policySearchResult3 = new ScrollResult()
.setScrollId("3")
- .setNumEntities(4)
+ .setNumEntities(5)
.setEntities(
new SearchEntityArray(
ImmutableList.of(new SearchEntity().setEntity(parentDomainPolicyUrn))));
final ScrollResult policySearchResult4 = new ScrollResult()
- .setNumEntities(4)
+ .setScrollId("4")
+ .setNumEntities(5)
.setEntities(
new SearchEntityArray(
ImmutableList.of(
new SearchEntity().setEntity(childDomainPolicyUrn))));
+ final ScrollResult policySearchResult5 = new ScrollResult()
+ .setNumEntities(5)
+ .setEntities(
+ new SearchEntityArray(
+ ImmutableList.of(
+ new SearchEntity().setEntity(adminPolicyUrn))));
+
when(_entityClient.scrollAcrossEntities(eq(List.of("dataHubPolicy")), eq(""), isNull(), any(), isNull(),
anyInt(), eq(new SearchFlags().setFulltext(true).setSkipAggregates(true).setSkipHighlighting(true).setSkipCache(true)), any()))
.thenReturn(policySearchResult1)
.thenReturn(policySearchResult2)
.thenReturn(policySearchResult3)
- .thenReturn(policySearchResult4);
+ .thenReturn(policySearchResult4)
+ .thenReturn(policySearchResult5);
when(_entityClient.batchGetV2(eq(POLICY_ENTITY_NAME), any(), eq(null), any())).thenAnswer(args -> {
Set<Urn> inputUrns = args.getArgument(1);
@@ -140,6 +159,8 @@ public void setupTest() throws Exception {
return Map.of(parentDomainPolicyUrn, new EntityResponse().setUrn(parentDomainPolicyUrn).setAspects(parentDomainPolicyAspectMap));
case "urn:li:dataHubPolicy:3":
return Map.of(childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspects(childDomainPolicyAspectMap));
+ case "urn:li:dataHubPolicy:4":
+ return Map.of(adminPolicyUrn, new EntityResponse().setUrn(adminPolicyUrn).setAspects(adminPolicyAspectMap));
default:
throw new IllegalStateException();
}
@@ -167,6 +188,10 @@ public void setupTest() throws Exception {
when(_entityClient.batchGetV2(any(), eq(Collections.singleton(PARENT_DOMAIN_URN)), eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)), any()))
.thenReturn(createDomainPropertiesBatchResponse(null));
+ // Mocks to reach role membership for a user urn
+ when(_entityClient.batchGetV2(any(), eq(Collections.singleton(USER_WITH_ADMIN_ROLE)), eq(Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME)), any())
+ ).thenReturn(createUserRoleMembershipBatchResponse(USER_WITH_ADMIN_ROLE, UrnUtils.getUrn("urn:li:dataHubRole:Admin")));
+
final Authentication systemAuthentication = new Authentication(
new Actor(ActorType.USER, DATAHUB_SYSTEM_CLIENT_ID),
""
@@ -302,6 +327,32 @@ public void testAuthorizedActorsActivePolicy() throws Exception {
));
}
+ @Test
+ public void testAuthorizedRoleActivePolicy() throws Exception {
+ final AuthorizedActors actors =
+ _dataHubAuthorizer.authorizedActors("EDIT_USER_PROFILE", // Should be inside the active policy.
+ Optional.of(new EntitySpec("dataset", "urn:li:dataset:1")));
+
+ assertFalse(actors.isAllUsers());
+ assertFalse(actors.isAllGroups());
+ assertEquals(new HashSet<>(actors.getUsers()), ImmutableSet.of());
+ assertEquals(new HashSet<>(actors.getGroups()), ImmutableSet.of());
+ assertEquals(new HashSet<>(actors.getRoles()), ImmutableSet.of(UrnUtils.getUrn("urn:li:dataHubRole:Admin")));
+ }
+
+ @Test
+ public void testAuthorizationBasedOnRoleIsAllowed() {
+ EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test");
+
+ AuthorizationRequest request = new AuthorizationRequest(
+ USER_WITH_ADMIN_ROLE.toString(),
+ "EDIT_USER_PROFILE",
+ Optional.of(resourceSpec)
+ );
+
+ assertEquals(_dataHubAuthorizer.authorize(request).getType(), AuthorizationResult.Type.ALLOW);
+ }
+
@Test
public void testAuthorizationOnDomainWithPrivilegeIsAllowed() {
EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test");
@@ -342,13 +393,6 @@ public void testAuthorizationOnDomainWithoutPrivilegeIsDenied() {
}
private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List<String> privileges, @Nullable final Urn domain) throws Exception {
- final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo();
- dataHubPolicyInfo.setType(METADATA_POLICY_TYPE);
- dataHubPolicyInfo.setState(active ? ACTIVE_POLICY_STATE : INACTIVE_POLICY_STATE);
- dataHubPolicyInfo.setPrivileges(new StringArray(privileges));
- dataHubPolicyInfo.setDisplayName("My Test Display");
- dataHubPolicyInfo.setDescription("My test display!");
- dataHubPolicyInfo.setEditable(true);
List<Urn> users = ImmutableList.of(Urn.createFromString("urn:li:corpuser:user1"), Urn.createFromString("urn:li:corpuser:user2"));
List<Urn> groups = ImmutableList.of(Urn.createFromString("urn:li:corpGroup:group1"), Urn.createFromString("urn:li:corpGroup:group2"));
@@ -359,6 +403,20 @@ private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List p
actorFilter.setAllGroups(true);
actorFilter.setUsers(new UrnArray(users));
actorFilter.setGroups(new UrnArray(groups));
+
+ return createDataHubPolicyInfoFor(active, privileges, domain, actorFilter);
+ }
+
+ private DataHubPolicyInfo createDataHubPolicyInfoFor(boolean active, List<String> privileges,
+ @Nullable final Urn domain, DataHubActorFilter actorFilter) throws Exception {
+ final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo();
+ dataHubPolicyInfo.setType(METADATA_POLICY_TYPE);
+ dataHubPolicyInfo.setState(active ? ACTIVE_POLICY_STATE : INACTIVE_POLICY_STATE);
+ dataHubPolicyInfo.setPrivileges(new StringArray(privileges));
+ dataHubPolicyInfo.setDisplayName("My Test Display");
+ dataHubPolicyInfo.setDescription("My test display!");
+ dataHubPolicyInfo.setEditable(true);
+
dataHubPolicyInfo.setActors(actorFilter);
final DataHubResourceFilter resourceFilter = new DataHubResourceFilter();
@@ -429,6 +487,21 @@ private Map createDomainPropertiesBatchResponse(@Nullable f
return batchResponse;
}
+ private Map<Urn, EntityResponse> createUserRoleMembershipBatchResponse(final Urn userUrn, @Nullable final Urn roleUrn) {
+ final Map<Urn, EntityResponse> batchResponse = new HashMap<>();
+ final EntityResponse response = new EntityResponse();
+ EnvelopedAspectMap aspectMap = new EnvelopedAspectMap();
+ final RoleMembership membership = new RoleMembership();
+ if (roleUrn != null) {
+ membership.setRoles(new UrnArray(roleUrn));
+ }
+ aspectMap.put(ROLE_MEMBERSHIP_ASPECT_NAME, new EnvelopedAspect()
+ .setValue(new com.linkedin.entity.Aspect(membership.data())));
+ response.setAspects(aspectMap);
+ batchResponse.put(userUrn, response);
+ return batchResponse;
+ }
+
private AuthorizerContext createAuthorizerContext(final Authentication systemAuthentication, final EntityClient entityClient) {
return new AuthorizerContext(Collections.emptyMap(), new DefaultEntitySpecResolver(systemAuthentication, entityClient));
}
diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java
index be8c948f8ef897..2790c16ba75e62 100644
--- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java
+++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java
@@ -1041,6 +1041,7 @@ public void testGetMatchingActorsResourceMatch() throws Exception {
Urn.createFromString("urn:li:corpuser:user2"))));
actorFilter.setGroups(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:corpGroup:group1"),
Urn.createFromString("urn:li:corpGroup:group2"))));
+ actorFilter.setRoles(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:role:Admin"))));
dataHubPolicyInfo.setActors(actorFilter);
final DataHubResourceFilter resourceFilter = new DataHubResourceFilter();
@@ -1056,8 +1057,8 @@ public void testGetMatchingActorsResourceMatch() throws Exception {
Collections.emptySet(), Collections.emptySet());
PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec));
- assertTrue(actors.allUsers());
- assertTrue(actors.allGroups());
+ assertTrue(actors.getAllUsers());
+ assertTrue(actors.getAllGroups());
assertEquals(actors.getUsers(),
ImmutableList.of(Urn.createFromString("urn:li:corpuser:user1"), Urn.createFromString("urn:li:corpuser:user2"),
@@ -1068,6 +1069,8 @@ public void testGetMatchingActorsResourceMatch() throws Exception {
Urn.createFromString("urn:li:corpGroup:group2"), Urn.createFromString(AUTHORIZED_GROUP) // Resource Owner
));
+ assertEquals(actors.getRoles(), ImmutableList.of(Urn.createFromString("urn:li:role:Admin")));
+
// Verify aspect client called, entity client not called.
verify(_entityClient, times(0)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)),
eq(null), any());
@@ -1106,15 +1109,58 @@ public void testGetMatchingActorsNoResourceMatch() throws Exception {
buildEntityResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy.
PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec));
- assertFalse(actors.allUsers());
- assertFalse(actors.allGroups());
+ assertFalse(actors.getAllUsers());
+ assertFalse(actors.getAllGroups());
assertEquals(actors.getUsers(), Collections.emptyList());
assertEquals(actors.getGroups(), Collections.emptyList());
+ //assertEquals(actors.getRoles(), Collections.emptyList());
// Verify no network calls
verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any());
}
+ @Test
+ public void testGetMatchingActorsByRoleResourceMatch() throws Exception {
+ final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo();
+ dataHubPolicyInfo.setType(METADATA_POLICY_TYPE);
+ dataHubPolicyInfo.setState(ACTIVE_POLICY_STATE);
+ dataHubPolicyInfo.setPrivileges(new StringArray("EDIT_ENTITY_TAGS"));
+ dataHubPolicyInfo.setDisplayName("My Test Display");
+ dataHubPolicyInfo.setDescription("My test display!");
+ dataHubPolicyInfo.setEditable(true);
+
+ final DataHubActorFilter actorFilter = new DataHubActorFilter();
+ actorFilter.setResourceOwners(true);
+ actorFilter.setAllUsers(false);
+ actorFilter.setAllGroups(false);
+ actorFilter.setRoles(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:dataHubRole:Editor"))));
+ dataHubPolicyInfo.setActors(actorFilter);
+
+ final DataHubResourceFilter resourceFilter = new DataHubResourceFilter();
+ resourceFilter.setAllResources(false);
+ resourceFilter.setType("dataset");
+ StringArray resourceUrns = new StringArray();
+ resourceUrns.add(RESOURCE_URN);
+ resourceFilter.setResources(resourceUrns);
+ dataHubPolicyInfo.setResources(resourceFilter);
+
+ ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(),
+ Collections.emptySet(), Collections.emptySet());
+
+ PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec));
+
+ assertFalse(actors.getAllUsers());
+ assertFalse(actors.getAllGroups());
+
+ assertEquals(actors.getUsers(), ImmutableList.of());
+ assertEquals(actors.getGroups(), ImmutableList.of());
+ assertEquals(actors.getRoles(), ImmutableList.of(Urn.createFromString("urn:li:dataHubRole:Editor")));
+
+ // Verify aspect client called, entity client not called.
+ verify(_entityClient, times(0)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)),
+ eq(null), any());
+ }
+
private Ownership createOwnershipAspect(final Boolean addUserOwner, final Boolean addGroupOwner) throws Exception {
final Ownership ownershipAspect = new Ownership();
final OwnerArray owners = new OwnerArray();
diff --git a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java
index 442ac1b0d287b3..e5f3e223ff505d 100644
--- a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java
+++ b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java
@@ -75,7 +75,7 @@ public AuthorizationResult authorize(@Nonnull AuthorizationRequest request) {
@Override
public AuthorizedActors authorizedActors(String privilege, Optional<EntitySpec> resourceSpec) {
- return new AuthorizedActors("ALL", null, null, true, true);
+ return new AuthorizedActors("ALL", null, null, null, true, true);
}
}
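
Across the files above the change is uniform: every place that previously tracked authorized users and groups now also tracks roles, and the wildcard flags are read through generated getters (getAllUsers/getAllGroups) instead of hand-written accessors. A compact TypeScript sketch of the merge step performed by AuthorizerChain (field and function names mirror the Java, but this is an illustration, not the source):

interface AuthorizedActors {
    privilege: string;
    users: string[];    // actor URNs
    groups: string[];   // group URNs
    roles: string[];    // role URNs, newly carried through by this patch
    allUsers: boolean;
    allGroups: boolean;
}

function mergeAuthorizedActors(original: AuthorizedActors, other: AuthorizedActors): AuthorizedActors {
    // Union the explicit actor lists and OR the wildcard flags.
    const union = (a: string[], b: string[]) => Array.from(new Set([...a, ...b]));
    return {
        privilege: original.privilege,
        users: union(original.users, other.users),
        groups: union(original.groups, other.groups),
        roles: union(original.roles, other.roles),
        allUsers: original.allUsers || other.allUsers,
        allGroups: original.allGroups || other.allGroups,
    };
}
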
From f73ecfdcbbc35437fcb80c9e27e78908dae23ea7 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Wed, 8 Nov 2023 18:17:49 -0500
Subject: [PATCH 31/33] style(ingest/tableau): Rename tableau_constant to c
(#9207)
---
.../src/datahub/ingestion/source/tableau.py | 597 ++++++++----------
.../ingestion/source/tableau_common.py | 14 +-
2 files changed, 272 insertions(+), 339 deletions(-)
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
index 4bc40b0aac9649..08df7599510f47 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py
@@ -59,7 +59,7 @@
)
from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source
from datahub.ingestion.api.workunit import MetadataWorkUnit
-from datahub.ingestion.source import tableau_constant
+from datahub.ingestion.source import tableau_constant as c
from datahub.ingestion.source.common.subtypes import (
BIContainerSubTypes,
DatasetSubTypes,
@@ -720,16 +720,12 @@ def get_connection_object_page(
query, connection_type, query_filter, count, offset, False
)
- if tableau_constant.ERRORS in query_data:
- errors = query_data[tableau_constant.ERRORS]
+ if c.ERRORS in query_data:
+ errors = query_data[c.ERRORS]
if all(
# The format of the error messages is highly unpredictable, so we have to
# be extra defensive with our parsing.
- error
- and (error.get(tableau_constant.EXTENSIONS) or {}).get(
- tableau_constant.SEVERITY
- )
- == tableau_constant.WARNING
+ error and (error.get(c.EXTENSIONS) or {}).get(c.SEVERITY) == c.WARNING
for error in errors
):
self.report.report_warning(key=connection_type, reason=f"{errors}")
@@ -737,14 +733,14 @@ def get_connection_object_page(
raise RuntimeError(f"Query {connection_type} error: {errors}")
connection_object = (
- query_data.get(tableau_constant.DATA).get(connection_type, {})
- if query_data.get(tableau_constant.DATA)
+ query_data.get(c.DATA).get(connection_type, {})
+ if query_data.get(c.DATA)
else {}
)
- total_count = connection_object.get(tableau_constant.TOTAL_COUNT, 0)
- has_next_page = connection_object.get(tableau_constant.PAGE_INFO, {}).get(
- tableau_constant.HAS_NEXT_PAGE, False
+ total_count = connection_object.get(c.TOTAL_COUNT, 0)
+ has_next_page = connection_object.get(c.PAGE_INFO, {}).get(
+ c.HAS_NEXT_PAGE, False
)
return connection_object, total_count, has_next_page
@@ -781,7 +777,7 @@ def get_connection_objects(
offset += count
- for obj in connection_objects.get(tableau_constant.NODES) or []:
+ for obj in connection_objects.get(c.NODES) or []:
yield obj
def emit_workbooks(self) -> Iterable[MetadataWorkUnit]:
@@ -790,11 +786,11 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]:
project.name for project in self.tableau_project_registry.values()
]
project_names_str: str = json.dumps(project_names)
- projects = f"{tableau_constant.PROJECT_NAME_WITH_IN}: {project_names_str}"
+ projects = f"{c.PROJECT_NAME_WITH_IN}: {project_names_str}"
for workbook in self.get_connection_objects(
workbook_graphql_query,
- tableau_constant.WORKBOOKS_CONNECTION,
+ c.WORKBOOKS_CONNECTION,
projects,
page_size_override=self.config.workbook_page_size,
):
@@ -804,11 +800,9 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]:
# however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later.
project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
if project_luid not in self.tableau_project_registry.keys():
- wrk_name: Optional[str] = workbook.get(tableau_constant.NAME)
- wrk_id: Optional[str] = workbook.get(tableau_constant.ID)
- prj_name: Optional[str] = workbook.get(
- tableau_constant.PROJECT_NAME
- )
+ wrk_name: Optional[str] = workbook.get(c.NAME)
+ wrk_id: Optional[str] = workbook.get(c.ID)
+ prj_name: Optional[str] = workbook.get(c.PROJECT_NAME)
logger.debug(
f"Skipping workbook {wrk_name}({wrk_id}) as it is project {prj_name}({project_luid}) not "
@@ -818,25 +812,22 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]:
yield from self.emit_workbook_as_container(workbook)
- for sheet in workbook.get(tableau_constant.SHEETS, []):
- self.sheet_ids.append(sheet[tableau_constant.ID])
+ for sheet in workbook.get(c.SHEETS, []):
+ self.sheet_ids.append(sheet[c.ID])
- for dashboard in workbook.get(tableau_constant.DASHBOARDS, []):
- self.dashboard_ids.append(dashboard[tableau_constant.ID])
+ for dashboard in workbook.get(c.DASHBOARDS, []):
+ self.dashboard_ids.append(dashboard[c.ID])
- for ds in workbook.get(tableau_constant.EMBEDDED_DATA_SOURCES, []):
- self.embedded_datasource_ids_being_used.append(
- ds[tableau_constant.ID]
- )
+ for ds in workbook.get(c.EMBEDDED_DATA_SOURCES, []):
+ self.embedded_datasource_ids_being_used.append(ds[c.ID])
def _track_custom_sql_ids(self, field: dict) -> None:
# Tableau shows custom sql datasource as a table in ColumnField's upstreamColumns.
- for column in field.get(tableau_constant.UPSTREAM_COLUMNS, []):
+ for column in field.get(c.UPSTREAM_COLUMNS, []):
table_id = (
- column.get(tableau_constant.TABLE, {}).get(tableau_constant.ID)
- if column.get(tableau_constant.TABLE)
- and column[tableau_constant.TABLE][tableau_constant.TYPE_NAME]
- == tableau_constant.CUSTOM_SQL_TABLE
+ column.get(c.TABLE, {}).get(c.ID)
+ if column.get(c.TABLE)
+ and column[c.TABLE][c.TYPE_NAME] == c.CUSTOM_SQL_TABLE
else None
)
@@ -861,15 +852,15 @@ def _create_upstream_table_lineage(
# and published datasource have same upstreamTables in this case.
if upstream_tables and is_embedded_ds:
logger.debug(
- f"Embedded datasource {datasource.get(tableau_constant.ID)} has upstreamDatasources.\
+ f"Embedded datasource {datasource.get(c.ID)} has upstreamDatasources.\
Setting only upstreamDatasources lineage. The upstreamTables lineage \
will be set via upstream published datasource."
)
else:
# This adds an edge to upstream DatabaseTables using `upstreamTables`
upstreams, id_to_urn = self.get_upstream_tables(
- datasource.get(tableau_constant.UPSTREAM_TABLES, []),
- datasource.get(tableau_constant.NAME),
+ datasource.get(c.UPSTREAM_TABLES, []),
+ datasource.get(c.NAME),
browse_path,
is_custom_sql=False,
)
@@ -878,23 +869,23 @@ def _create_upstream_table_lineage(
# This adds an edge to upstream CustomSQLTables using `fields`.`upstreamColumns`.`table`
csql_upstreams, csql_id_to_urn = self.get_upstream_csql_tables(
- datasource.get(tableau_constant.FIELDS) or [],
+ datasource.get(c.FIELDS) or [],
)
upstream_tables.extend(csql_upstreams)
table_id_to_urn.update(csql_id_to_urn)
logger.debug(
- f"A total of {len(upstream_tables)} upstream table edges found for datasource {datasource[tableau_constant.ID]}"
+ f"A total of {len(upstream_tables)} upstream table edges found for datasource {datasource[c.ID]}"
)
datasource_urn = builder.make_dataset_urn_with_platform_instance(
platform=self.platform,
- name=datasource[tableau_constant.ID],
+ name=datasource[c.ID],
platform_instance=self.config.platform_instance,
env=self.config.env,
)
- if datasource.get(tableau_constant.FIELDS):
+ if datasource.get(c.FIELDS):
if self.config.extract_column_level_lineage:
# Find fine grained lineage for datasource column to datasource column edge,
# upstream columns may be from same datasource
@@ -912,20 +903,20 @@ def _create_upstream_table_lineage(
fine_grained_lineages.extend(upstream_columns)
logger.debug(
- f"A total of {len(fine_grained_lineages)} upstream column edges found for datasource {datasource[tableau_constant.ID]}"
+ f"A total of {len(fine_grained_lineages)} upstream column edges found for datasource {datasource[c.ID]}"
)
return upstream_tables, fine_grained_lineages
def get_upstream_datasources(self, datasource: dict) -> List[Upstream]:
upstream_tables = []
- for ds in datasource.get(tableau_constant.UPSTREAM_DATA_SOURCES, []):
- if ds[tableau_constant.ID] not in self.datasource_ids_being_used:
- self.datasource_ids_being_used.append(ds[tableau_constant.ID])
+ for ds in datasource.get(c.UPSTREAM_DATA_SOURCES, []):
+ if ds[c.ID] not in self.datasource_ids_being_used:
+ self.datasource_ids_being_used.append(ds[c.ID])
upstream_ds_urn = builder.make_dataset_urn_with_platform_instance(
platform=self.platform,
- name=ds[tableau_constant.ID],
+ name=ds[c.ID],
platform_instance=self.config.platform_instance,
env=self.config.env,
)
@@ -943,20 +934,15 @@ def get_upstream_csql_tables(
csql_id_to_urn = {}
for field in fields:
- if not field.get(tableau_constant.UPSTREAM_COLUMNS):
+ if not field.get(c.UPSTREAM_COLUMNS):
continue
- for upstream_col in field[tableau_constant.UPSTREAM_COLUMNS]:
+ for upstream_col in field[c.UPSTREAM_COLUMNS]:
if (
upstream_col
- and upstream_col.get(tableau_constant.TABLE)
- and upstream_col.get(tableau_constant.TABLE)[
- tableau_constant.TYPE_NAME
- ]
- == tableau_constant.CUSTOM_SQL_TABLE
+ and upstream_col.get(c.TABLE)
+ and upstream_col.get(c.TABLE)[c.TYPE_NAME] == c.CUSTOM_SQL_TABLE
):
- upstream_table_id = upstream_col.get(tableau_constant.TABLE)[
- tableau_constant.ID
- ]
+ upstream_table_id = upstream_col.get(c.TABLE)[c.ID]
csql_urn = builder.make_dataset_urn_with_platform_instance(
platform=self.platform,
@@ -986,18 +972,18 @@ def get_upstream_tables(
for table in tables:
# skip upstream tables when there is no column info when retrieving datasource
# Lineage and Schema details for these will be taken care in self.emit_custom_sql_datasources()
- num_tbl_cols: Optional[int] = table.get(
- tableau_constant.COLUMNS_CONNECTION
- ) and table[tableau_constant.COLUMNS_CONNECTION].get("totalCount")
+ num_tbl_cols: Optional[int] = table.get(c.COLUMNS_CONNECTION) and table[
+ c.COLUMNS_CONNECTION
+ ].get("totalCount")
if not is_custom_sql and not num_tbl_cols:
logger.debug(
- f"Skipping upstream table with id {table[tableau_constant.ID]}, no columns: {table}"
+ f"Skipping upstream table with id {table[c.ID]}, no columns: {table}"
)
continue
- elif table[tableau_constant.NAME] is None:
+ elif table[c.NAME] is None:
self.report.num_upstream_table_skipped_no_name += 1
logger.warning(
- f"Skipping upstream table {table[tableau_constant.ID]} from lineage since its name is none: {table}"
+ f"Skipping upstream table {table[c.ID]} from lineage since its name is none: {table}"
)
continue
@@ -1014,7 +1000,7 @@ def get_upstream_tables(
self.config.platform_instance_map,
self.config.lineage_overrides,
)
- table_id_to_urn[table[tableau_constant.ID]] = table_urn
+ table_id_to_urn[table[c.ID]] = table_urn
upstream_table = Upstream(
dataset=table_urn,
@@ -1029,13 +1015,13 @@ def get_upstream_tables(
if table_urn not in self.database_tables:
self.database_tables[table_urn] = DatabaseTable(
urn=table_urn,
- id=table[tableau_constant.ID],
+ id=table[c.ID],
num_cols=num_tbl_cols,
paths={table_path} if table_path else set(),
)
else:
self.database_tables[table_urn].update_table(
- table[tableau_constant.ID], num_tbl_cols, table_path
+ table[c.ID], num_tbl_cols, table_path
)
return upstream_tables, table_id_to_urn
@@ -1047,24 +1033,24 @@ def get_upstream_columns_of_fields_in_datasource(
table_id_to_urn: Dict[str, str],
) -> List[FineGrainedLineage]:
fine_grained_lineages = []
- for field in datasource.get(tableau_constant.FIELDS) or []:
- field_name = field.get(tableau_constant.NAME)
+ for field in datasource.get(c.FIELDS) or []:
+ field_name = field.get(c.NAME)
# upstreamColumns lineage will be set via upstreamFields.
# such as for CalculatedField
if (
not field_name
- or not field.get(tableau_constant.UPSTREAM_COLUMNS)
- or field.get(tableau_constant.UPSTREAM_FIELDS)
+ or not field.get(c.UPSTREAM_COLUMNS)
+ or field.get(c.UPSTREAM_FIELDS)
):
continue
input_columns = []
- for upstream_col in field.get(tableau_constant.UPSTREAM_COLUMNS):
+ for upstream_col in field.get(c.UPSTREAM_COLUMNS):
if not upstream_col:
continue
- name = upstream_col.get(tableau_constant.NAME)
+ name = upstream_col.get(c.NAME)
upstream_table_id = (
- upstream_col.get(tableau_constant.TABLE)[tableau_constant.ID]
- if upstream_col.get(tableau_constant.TABLE)
+ upstream_col.get(c.TABLE)[c.ID]
+ if upstream_col.get(c.TABLE)
else None
)
if (
@@ -1110,23 +1096,21 @@ def get_upstream_fields_of_field_in_datasource(
self, datasource: dict, datasource_urn: str
) -> List[FineGrainedLineage]:
fine_grained_lineages = []
- for field in datasource.get(tableau_constant.FIELDS) or []:
- field_name = field.get(tableau_constant.NAME)
+ for field in datasource.get(c.FIELDS) or []:
+ field_name = field.get(c.NAME)
# It is observed that upstreamFields gives one-hop field
# lineage, and not multi-hop field lineage
# This behavior is as desired in our case.
- if not field_name or not field.get(tableau_constant.UPSTREAM_FIELDS):
+ if not field_name or not field.get(c.UPSTREAM_FIELDS):
continue
input_fields = []
- for upstream_field in field.get(tableau_constant.UPSTREAM_FIELDS):
+ for upstream_field in field.get(c.UPSTREAM_FIELDS):
if not upstream_field:
continue
- name = upstream_field.get(tableau_constant.NAME)
+ name = upstream_field.get(c.NAME)
upstream_ds_id = (
- upstream_field.get(tableau_constant.DATA_SOURCE)[
- tableau_constant.ID
- ]
- if upstream_field.get(tableau_constant.DATA_SOURCE)
+ upstream_field.get(c.DATA_SOURCE)[c.ID]
+ if upstream_field.get(c.DATA_SOURCE)
else None
)
if name and upstream_ds_id:
@@ -1212,35 +1196,37 @@ def get_upstream_fields_from_custom_sql(
return fine_grained_lineages
def get_transform_operation(self, field: dict) -> str:
- field_type = field[tableau_constant.TYPE_NAME]
+ field_type = field[c.TYPE_NAME]
if field_type in (
- tableau_constant.DATA_SOURCE_FIELD,
- tableau_constant.COLUMN_FIELD,
+ c.DATA_SOURCE_FIELD,
+ c.COLUMN_FIELD,
):
- op = tableau_constant.IDENTITY # How to specify exact same
- elif field_type == tableau_constant.CALCULATED_FIELD:
+ op = c.IDENTITY # How to specify exact same
+ elif field_type == c.CALCULATED_FIELD:
op = field_type
- if field.get(tableau_constant.FORMULA):
- op += f"formula: {field.get(tableau_constant.FORMULA)}"
+ if field.get(c.FORMULA):
+ op += f"formula: {field.get(c.FORMULA)}"
else:
op = field_type # BinField, CombinedField, etc
return op
def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
- custom_sql_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.custom_sql_ids_being_used)}"
+ custom_sql_filter = (
+ f"{c.ID_WITH_IN}: {json.dumps(self.custom_sql_ids_being_used)}"
+ )
custom_sql_connection = list(
self.get_connection_objects(
custom_sql_graphql_query,
- tableau_constant.CUSTOM_SQL_TABLE_CONNECTION,
+ c.CUSTOM_SQL_TABLE_CONNECTION,
custom_sql_filter,
)
)
unique_custom_sql = get_unique_custom_sql(custom_sql_connection)
for csql in unique_custom_sql:
- csql_id: str = csql[tableau_constant.ID]
+ csql_id: str = csql[c.ID]
csql_urn = builder.make_dataset_urn_with_platform_instance(
platform=self.platform,
name=csql_id,
@@ -1256,40 +1242,33 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
datasource_name = None
project = None
- if len(csql[tableau_constant.DATA_SOURCES]) > 0:
+ if len(csql[c.DATA_SOURCES]) > 0:
# CustomSQLTable id owned by exactly one tableau data source
logger.debug(
- f"Number of datasources referencing CustomSQLTable: {len(csql[tableau_constant.DATA_SOURCES])}"
+ f"Number of datasources referencing CustomSQLTable: {len(csql[c.DATA_SOURCES])}"
)
- datasource = csql[tableau_constant.DATA_SOURCES][0]
- datasource_name = datasource.get(tableau_constant.NAME)
+ datasource = csql[c.DATA_SOURCES][0]
+ datasource_name = datasource.get(c.NAME)
if datasource.get(
- tableau_constant.TYPE_NAME
- ) == tableau_constant.EMBEDDED_DATA_SOURCE and datasource.get(
- tableau_constant.WORKBOOK
- ):
+ c.TYPE_NAME
+ ) == c.EMBEDDED_DATA_SOURCE and datasource.get(c.WORKBOOK):
datasource_name = (
- f"{datasource.get(tableau_constant.WORKBOOK).get(tableau_constant.NAME)}/{datasource_name}"
- if datasource_name
- and datasource.get(tableau_constant.WORKBOOK).get(
- tableau_constant.NAME
- )
+ f"{datasource.get(c.WORKBOOK).get(c.NAME)}/{datasource_name}"
+ if datasource_name and datasource.get(c.WORKBOOK).get(c.NAME)
else None
)
logger.debug(
f"Adding datasource {datasource_name}({datasource.get('id')}) to container"
)
yield from add_entity_to_container(
- self.gen_workbook_key(
- datasource[tableau_constant.WORKBOOK][tableau_constant.ID]
- ),
- tableau_constant.DATASET,
+ self.gen_workbook_key(datasource[c.WORKBOOK][c.ID]),
+ c.DATASET,
dataset_snapshot.urn,
)
project = self._get_project_browse_path_name(datasource)
- tables = csql.get(tableau_constant.TABLES, [])
+ tables = csql.get(c.TABLES, [])
if tables:
# lineage from custom sql -> datasets/tables #
@@ -1306,9 +1285,8 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
# Schema Metadata
# if condition is needed as graphQL return "cloumns": None
columns: List[Dict[Any, Any]] = (
- cast(List[Dict[Any, Any]], csql.get(tableau_constant.COLUMNS))
- if tableau_constant.COLUMNS in csql
- and csql.get(tableau_constant.COLUMNS) is not None
+ cast(List[Dict[Any, Any]], csql.get(c.COLUMNS))
+ if c.COLUMNS in csql and csql.get(c.COLUMNS) is not None
else []
)
schema_metadata = self.get_schema_metadata_for_custom_sql(columns)
@@ -1320,7 +1298,7 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
if project and datasource_name:
browse_paths = BrowsePathsClass(
paths=[
- f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource[tableau_constant.NAME]}"
+ f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource[c.NAME]}"
]
)
dataset_snapshot.aspects.append(browse_paths)
@@ -1328,27 +1306,25 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]:
logger.debug(f"Browse path not set for Custom SQL table {csql_id}")
dataset_properties = DatasetPropertiesClass(
- name=csql.get(tableau_constant.NAME),
- description=csql.get(tableau_constant.DESCRIPTION),
+ name=csql.get(c.NAME),
+ description=csql.get(c.DESCRIPTION),
)
dataset_snapshot.aspects.append(dataset_properties)
- if csql.get(tableau_constant.QUERY):
+ if csql.get(c.QUERY):
view_properties = ViewPropertiesClass(
materialized=False,
- viewLanguage=tableau_constant.SQL,
- viewLogic=clean_query(csql[tableau_constant.QUERY]),
+ viewLanguage=c.SQL,
+ viewLogic=clean_query(csql[c.QUERY]),
)
dataset_snapshot.aspects.append(view_properties)
yield self.get_metadata_change_event(dataset_snapshot)
yield self.get_metadata_change_proposal(
dataset_snapshot.urn,
- aspect_name=tableau_constant.SUB_TYPES,
- aspect=SubTypesClass(
- typeNames=[DatasetSubTypes.VIEW, tableau_constant.CUSTOM_SQL]
- ),
+ aspect_name=c.SUB_TYPES,
+ aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]),
)
def get_schema_metadata_for_custom_sql(
@@ -1359,21 +1335,19 @@ def get_schema_metadata_for_custom_sql(
for field in columns:
# Datasource fields
- if field.get(tableau_constant.NAME) is None:
+ if field.get(c.NAME) is None:
self.report.num_csql_field_skipped_no_name += 1
logger.warning(
- f"Skipping field {field[tableau_constant.ID]} from schema since its name is none"
+ f"Skipping field {field[c.ID]} from schema since its name is none"
)
continue
- nativeDataType = field.get(
- tableau_constant.REMOTE_TYPE, tableau_constant.UNKNOWN
- )
+ nativeDataType = field.get(c.REMOTE_TYPE, c.UNKNOWN)
TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
schema_field = SchemaField(
- fieldPath=field[tableau_constant.NAME],
+ fieldPath=field[c.NAME],
type=SchemaFieldDataType(type=TypeClass()),
nativeDataType=nativeDataType,
- description=field.get(tableau_constant.DESCRIPTION),
+ description=field.get(c.DESCRIPTION),
)
fields.append(schema_field)
@@ -1391,28 +1365,25 @@ def _get_published_datasource_project_luid(self, ds: dict) -> Optional[str]:
# This is fallback in case "get all datasources" query fails for some reason.
# It is possible due to https://github.com/tableau/server-client-python/issues/1210
if (
- ds.get(tableau_constant.LUID)
- and ds[tableau_constant.LUID] not in self.datasource_project_map.keys()
+ ds.get(c.LUID)
+ and ds[c.LUID] not in self.datasource_project_map.keys()
and self.report.get_all_datasources_query_failed
):
logger.debug(
- f"published datasource {ds.get(tableau_constant.NAME)} project_luid not found."
- f" Running get datasource query for {ds[tableau_constant.LUID]}"
+ f"published datasource {ds.get(c.NAME)} project_luid not found."
+ f" Running get datasource query for {ds[c.LUID]}"
)
# Query and update self.datasource_project_map with luid
- self._query_published_datasource_for_project_luid(ds[tableau_constant.LUID])
+ self._query_published_datasource_for_project_luid(ds[c.LUID])
if (
- ds.get(tableau_constant.LUID)
- and ds[tableau_constant.LUID] in self.datasource_project_map.keys()
- and self.datasource_project_map[ds[tableau_constant.LUID]]
- in self.tableau_project_registry
+ ds.get(c.LUID)
+ and ds[c.LUID] in self.datasource_project_map.keys()
+ and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry
):
- return self.datasource_project_map[ds[tableau_constant.LUID]]
+ return self.datasource_project_map[ds[c.LUID]]
- logger.debug(
- f"published datasource {ds.get(tableau_constant.NAME)} project_luid not found"
- )
+ logger.debug(f"published datasource {ds.get(c.NAME)} project_luid not found")
return None
@@ -1437,60 +1408,52 @@ def _query_published_datasource_for_project_luid(self, ds_luid: str) -> None:
logger.debug("Error stack trace", exc_info=True)
def _get_workbook_project_luid(self, wb: dict) -> Optional[str]:
- if wb.get(tableau_constant.LUID) and self.workbook_project_map.get(
- wb[tableau_constant.LUID]
- ):
- return self.workbook_project_map[wb[tableau_constant.LUID]]
+ if wb.get(c.LUID) and self.workbook_project_map.get(wb[c.LUID]):
+ return self.workbook_project_map[wb[c.LUID]]
- logger.debug(f"workbook {wb.get(tableau_constant.NAME)} project_luid not found")
+ logger.debug(f"workbook {wb.get(c.NAME)} project_luid not found")
return None
def _get_embedded_datasource_project_luid(self, ds: dict) -> Optional[str]:
- if ds.get(tableau_constant.WORKBOOK):
+ if ds.get(c.WORKBOOK):
project_luid: Optional[str] = self._get_workbook_project_luid(
- ds[tableau_constant.WORKBOOK]
+ ds[c.WORKBOOK]
)
if project_luid and project_luid in self.tableau_project_registry:
return project_luid
- logger.debug(
- f"embedded datasource {ds.get(tableau_constant.NAME)} project_luid not found"
- )
+ logger.debug(f"embedded datasource {ds.get(c.NAME)} project_luid not found")
return None
def _get_datasource_project_luid(self, ds: dict) -> Optional[str]:
# Only published and embedded data-sources are supported
- ds_type: Optional[str] = ds.get(tableau_constant.TYPE_NAME)
+ ds_type: Optional[str] = ds.get(c.TYPE_NAME)
if ds_type not in (
- tableau_constant.PUBLISHED_DATA_SOURCE,
- tableau_constant.EMBEDDED_DATA_SOURCE,
+ c.PUBLISHED_DATA_SOURCE,
+ c.EMBEDDED_DATA_SOURCE,
):
logger.debug(
- f"datasource {ds.get(tableau_constant.NAME)} type {ds.get(tableau_constant.TYPE_NAME)} is "
+ f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is "
f"unsupported"
)
return None
func_selector: Any = {
- tableau_constant.PUBLISHED_DATA_SOURCE: self._get_published_datasource_project_luid,
- tableau_constant.EMBEDDED_DATA_SOURCE: self._get_embedded_datasource_project_luid,
+ c.PUBLISHED_DATA_SOURCE: self._get_published_datasource_project_luid,
+ c.EMBEDDED_DATA_SOURCE: self._get_embedded_datasource_project_luid,
}
return func_selector[ds_type](ds)
@staticmethod
def _get_datasource_project_name(ds: dict) -> Optional[str]:
- if ds.get(
- tableau_constant.TYPE_NAME
- ) == tableau_constant.EMBEDDED_DATA_SOURCE and ds.get(
- tableau_constant.WORKBOOK
- ):
- return ds[tableau_constant.WORKBOOK].get(tableau_constant.PROJECT_NAME)
- if ds.get(tableau_constant.TYPE_NAME) == tableau_constant.PUBLISHED_DATA_SOURCE:
- return ds.get(tableau_constant.PROJECT_NAME)
+ if ds.get(c.TYPE_NAME) == c.EMBEDDED_DATA_SOURCE and ds.get(c.WORKBOOK):
+ return ds[c.WORKBOOK].get(c.PROJECT_NAME)
+ if ds.get(c.TYPE_NAME) == c.PUBLISHED_DATA_SOURCE:
+ return ds.get(c.PROJECT_NAME)
return None
def _get_project_browse_path_name(self, ds: dict) -> Optional[str]:
@@ -1502,7 +1465,7 @@ def _get_project_browse_path_name(self, ds: dict) -> Optional[str]:
project_luid = self._get_datasource_project_luid(ds)
if project_luid is None:
logger.warning(
- f"Could not load project hierarchy for datasource {ds.get(tableau_constant.NAME)}. Please check permissions."
+ f"Could not load project hierarchy for datasource {ds.get(c.NAME)}. Please check permissions."
)
logger.debug(f"datasource = {ds}")
return None
@@ -1515,7 +1478,7 @@ def _create_lineage_to_upstream_tables(
# This adds an edge to upstream DatabaseTables using `upstreamTables`
upstream_tables, _ = self.get_upstream_tables(
tables,
- datasource.get(tableau_constant.NAME) or "",
+ datasource.get(c.NAME) or "",
self._get_project_browse_path_name(datasource),
is_custom_sql=True,
)
@@ -1524,7 +1487,7 @@ def _create_lineage_to_upstream_tables(
upstream_lineage = UpstreamLineage(upstreams=upstream_tables)
yield self.get_metadata_change_proposal(
csql_urn,
- aspect_name=tableau_constant.UPSTREAM_LINEAGE,
+ aspect_name=c.UPSTREAM_LINEAGE,
aspect=upstream_lineage,
)
@@ -1547,22 +1510,19 @@ def parse_custom_sql(
]
],
) -> Optional["SqlParsingResult"]:
- database_info = datasource.get(tableau_constant.DATABASE) or {}
+ database_info = datasource.get(c.DATABASE) or {}
- if datasource.get(tableau_constant.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False):
+ if datasource.get(c.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False):
logger.debug(f"datasource {datasource_urn} is not created from custom sql")
return None
- if (
- tableau_constant.NAME not in database_info
- or tableau_constant.CONNECTION_TYPE not in database_info
- ):
+ if c.NAME not in database_info or c.CONNECTION_TYPE not in database_info:
logger.debug(
f"database information is missing from datasource {datasource_urn}"
)
return None
- query = datasource.get(tableau_constant.QUERY)
+ query = datasource.get(c.QUERY)
if query is None:
logger.debug(
f"raw sql query is not available for datasource {datasource_urn}"
@@ -1571,13 +1531,13 @@ def parse_custom_sql(
logger.debug(f"Parsing sql={query}")
- upstream_db = database_info.get(tableau_constant.NAME)
+ upstream_db = database_info.get(c.NAME)
if func_overridden_info is not None:
# Override the information as per configuration
upstream_db, platform_instance, platform, _ = func_overridden_info(
- database_info[tableau_constant.CONNECTION_TYPE],
- database_info.get(tableau_constant.NAME),
+ database_info[c.CONNECTION_TYPE],
+ database_info.get(c.NAME),
self.config.platform_instance_map,
self.config.lineage_overrides,
)
@@ -1631,7 +1591,7 @@ def _create_lineage_from_unsupported_csql(
yield self.get_metadata_change_proposal(
csql_urn,
- aspect_name=tableau_constant.UPSTREAM_LINEAGE,
+ aspect_name=c.UPSTREAM_LINEAGE,
aspect=upstream_lineage,
)
@@ -1642,10 +1602,10 @@ def _get_schema_metadata_for_datasource(
for field in datasource_fields:
# check datasource - custom sql relations from a field being referenced
self._track_custom_sql_ids(field)
- if field.get(tableau_constant.NAME) is None:
+ if field.get(c.NAME) is None:
self.report.num_upstream_table_skipped_no_name += 1
logger.warning(
- f"Skipping field {field[tableau_constant.ID]} from schema since its name is none"
+ f"Skipping field {field[c.ID]} from schema since its name is none"
)
continue
@@ -1678,7 +1638,7 @@ def get_metadata_change_proposal(
aspect: Union["UpstreamLineage", "SubTypesClass"],
) -> MetadataWorkUnit:
return MetadataChangeProposalWrapper(
- entityType=tableau_constant.DATASET,
+ entityType=c.DATASET,
changeType=ChangeTypeClass.UPSERT,
entityUrn=urn,
aspectName=aspect_name,
@@ -1696,10 +1656,8 @@ def emit_datasource(
datasource_info = datasource
browse_path = self._get_project_browse_path_name(datasource)
- logger.debug(
- f"datasource {datasource.get(tableau_constant.NAME)} browse-path {browse_path}"
- )
- datasource_id = datasource[tableau_constant.ID]
+ logger.debug(f"datasource {datasource.get(c.NAME)} browse-path {browse_path}")
+ datasource_id = datasource[c.ID]
datasource_urn = builder.make_dataset_urn_with_platform_instance(
self.platform, datasource_id, self.config.platform_instance, self.config.env
)
@@ -1713,13 +1671,10 @@ def emit_datasource(
# Browse path
- if (
- browse_path
- and is_embedded_ds
- and workbook
- and workbook.get(tableau_constant.NAME)
- ):
- browse_path = f"{browse_path}/{workbook[tableau_constant.NAME].replace('/', REPLACE_SLASH_CHAR)}"
+ if browse_path and is_embedded_ds and workbook and workbook.get(c.NAME):
+ browse_path = (
+ f"{browse_path}/{workbook[c.NAME].replace('/', REPLACE_SLASH_CHAR)}"
+ )
if browse_path:
browse_paths = BrowsePathsClass(
@@ -1729,12 +1684,10 @@ def emit_datasource(
# Ownership
owner = (
- self._get_ownership(
- datasource_info[tableau_constant.OWNER][tableau_constant.USERNAME]
- )
+ self._get_ownership(datasource_info[c.OWNER][c.USERNAME])
if datasource_info
- and datasource_info.get(tableau_constant.OWNER)
- and datasource_info[tableau_constant.OWNER].get(tableau_constant.USERNAME)
+ and datasource_info.get(c.OWNER)
+ and datasource_info[c.OWNER].get(c.USERNAME)
else None
)
if owner is not None:
@@ -1742,24 +1695,22 @@ def emit_datasource(
# Dataset properties
dataset_props = DatasetPropertiesClass(
- name=datasource.get(tableau_constant.NAME),
- description=datasource.get(tableau_constant.DESCRIPTION),
+ name=datasource.get(c.NAME),
+ description=datasource.get(c.DESCRIPTION),
customProperties=self.get_custom_props_from_dict(
datasource,
[
- tableau_constant.HAS_EXTRACTS,
- tableau_constant.EXTRACT_LAST_REFRESH_TIME,
- tableau_constant.EXTRACT_LAST_INCREMENTAL_UPDATE_TIME,
- tableau_constant.EXTRACT_LAST_UPDATE_TIME,
+ c.HAS_EXTRACTS,
+ c.EXTRACT_LAST_REFRESH_TIME,
+ c.EXTRACT_LAST_INCREMENTAL_UPDATE_TIME,
+ c.EXTRACT_LAST_UPDATE_TIME,
],
),
)
dataset_snapshot.aspects.append(dataset_props)
# Upstream Tables
- if datasource.get(tableau_constant.UPSTREAM_TABLES) or datasource.get(
- tableau_constant.UPSTREAM_DATA_SOURCES
- ):
+ if datasource.get(c.UPSTREAM_TABLES) or datasource.get(c.UPSTREAM_DATA_SOURCES):
# datasource -> db table relations
(
upstream_tables,
@@ -1779,13 +1730,13 @@ def emit_datasource(
)
yield self.get_metadata_change_proposal(
datasource_urn,
- aspect_name=tableau_constant.UPSTREAM_LINEAGE,
+ aspect_name=c.UPSTREAM_LINEAGE,
aspect=upstream_lineage,
)
# Datasource Fields
schema_metadata = self._get_schema_metadata_for_datasource(
- datasource.get(tableau_constant.FIELDS, [])
+ datasource.get(c.FIELDS, [])
)
if schema_metadata is not None:
dataset_snapshot.aspects.append(schema_metadata)
@@ -1793,7 +1744,7 @@ def emit_datasource(
yield self.get_metadata_change_event(dataset_snapshot)
yield self.get_metadata_change_proposal(
dataset_snapshot.urn,
- aspect_name=tableau_constant.SUB_TYPES,
+ aspect_name=c.SUB_TYPES,
aspect=SubTypesClass(
typeNames=(
["Embedded Data Source"]
@@ -1809,7 +1760,7 @@ def emit_datasource(
if container_key is not None:
yield from add_entity_to_container(
container_key,
- tableau_constant.DATASET,
+ c.DATASET,
dataset_snapshot.urn,
)
@@ -1822,10 +1773,10 @@ def _get_datasource_container_key(
container_key: Optional[ContainerKey] = None
        if is_embedded_ds:  # If the datasource is embedded, its parent container is the workbook
if workbook is not None:
- container_key = self.gen_workbook_key(workbook[tableau_constant.ID])
+ container_key = self.gen_workbook_key(workbook[c.ID])
else:
logger.warning(
- f"Parent container not set for embedded datasource {datasource[tableau_constant.ID]}"
+ f"Parent container not set for embedded datasource {datasource[c.ID]}"
)
else:
parent_project_luid = self._get_published_datasource_project_luid(
@@ -1836,17 +1787,19 @@ def _get_datasource_container_key(
container_key = self.gen_project_key(parent_project_luid)
else:
logger.warning(
- f"Parent container not set for published datasource {datasource[tableau_constant.ID]}"
+ f"Parent container not set for published datasource {datasource[c.ID]}"
)
return container_key
def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]:
- datasource_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.datasource_ids_being_used)}"
+ datasource_filter = (
+ f"{c.ID_WITH_IN}: {json.dumps(self.datasource_ids_being_used)}"
+ )
for datasource in self.get_connection_objects(
published_datasource_graphql_query,
- tableau_constant.PUBLISHED_DATA_SOURCES_CONNECTION,
+ c.PUBLISHED_DATA_SOURCES_CONNECTION,
datasource_filter,
):
yield from self.emit_datasource(datasource)
@@ -1855,11 +1808,13 @@ def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
database_table_id_to_urn_map: Dict[str, str] = dict()
for urn, tbl in self.database_tables.items():
database_table_id_to_urn_map[tbl.id] = urn
- tables_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(list(database_table_id_to_urn_map.keys()))}"
+ tables_filter = (
+ f"{c.ID_WITH_IN}: {json.dumps(list(database_table_id_to_urn_map.keys()))}"
+ )
for table in self.get_connection_objects(
database_tables_graphql_query,
- tableau_constant.DATABASE_TABLES_CONNECTION,
+ c.DATABASE_TABLES_CONNECTION,
tables_filter,
):
yield from self.emit_table(table, database_table_id_to_urn_map)
@@ -1867,11 +1822,9 @@ def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]:
def emit_table(
self, table: dict, database_table_id_to_urn_map: Dict[str, str]
) -> Iterable[MetadataWorkUnit]:
- database_table = self.database_tables[
- database_table_id_to_urn_map[table[tableau_constant.ID]]
- ]
- columns = table.get(tableau_constant.COLUMNS, [])
- is_embedded = table.get(tableau_constant.IS_EMBEDDED) or False
+ database_table = self.database_tables[database_table_id_to_urn_map[table[c.ID]]]
+ columns = table.get(c.COLUMNS, [])
+ is_embedded = table.get(c.IS_EMBEDDED) or False
if not is_embedded and not self.config.ingest_tables_external:
logger.debug(
f"Skipping external table {database_table.urn} as ingest_tables_external is set to False"
@@ -1907,21 +1860,19 @@ def get_schema_metadata_for_table(
if columns:
fields = []
for field in columns:
- if field.get(tableau_constant.NAME) is None:
+ if field.get(c.NAME) is None:
self.report.num_table_field_skipped_no_name += 1
logger.warning(
- f"Skipping field {field[tableau_constant.ID]} from schema since its name is none"
+ f"Skipping field {field[c.ID]} from schema since its name is none"
)
continue
- nativeDataType = field.get(
- tableau_constant.REMOTE_TYPE, tableau_constant.UNKNOWN
- )
+ nativeDataType = field.get(c.REMOTE_TYPE, c.UNKNOWN)
TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass)
schema_field = SchemaField(
- fieldPath=field[tableau_constant.NAME],
+ fieldPath=field[c.NAME],
type=SchemaFieldDataType(type=TypeClass()),
- description=field.get(tableau_constant.DESCRIPTION),
+ description=field.get(c.DESCRIPTION),
nativeDataType=nativeDataType,
)
@@ -1941,11 +1892,9 @@ def get_schema_metadata_for_table(
def get_sheetwise_upstream_datasources(self, sheet: dict) -> set:
sheet_upstream_datasources = set()
- for field in sheet.get(tableau_constant.DATA_SOURCE_FIELDS) or []:
- if field and field.get(tableau_constant.DATA_SOURCE):
- sheet_upstream_datasources.add(
- field[tableau_constant.DATA_SOURCE][tableau_constant.ID]
- )
+ for field in sheet.get(c.DATA_SOURCE_FIELDS) or []:
+ if field and field.get(c.DATA_SOURCE):
+ sheet_upstream_datasources.add(field[c.DATA_SOURCE][c.ID])
return sheet_upstream_datasources
@@ -1961,20 +1910,20 @@ def _create_datahub_chart_usage_stat(
def _get_chart_stat_wu(
self, sheet: dict, sheet_urn: str
) -> Optional[MetadataWorkUnit]:
- luid: Optional[str] = sheet.get(tableau_constant.LUID)
+ luid: Optional[str] = sheet.get(c.LUID)
if luid is None:
logger.debug(
"stat:luid is none for sheet %s(id:%s)",
- sheet.get(tableau_constant.NAME),
- sheet.get(tableau_constant.ID),
+ sheet.get(c.NAME),
+ sheet.get(c.ID),
)
return None
usage_stat: Optional[UsageStat] = self.tableau_stat_registry.get(luid)
if usage_stat is None:
logger.debug(
"stat:UsageStat is not available in tableau_stat_registry for sheet %s(id:%s)",
- sheet.get(tableau_constant.NAME),
- sheet.get(tableau_constant.ID),
+ sheet.get(c.NAME),
+ sheet.get(c.ID),
)
return None
@@ -1983,8 +1932,8 @@ def _get_chart_stat_wu(
)
logger.debug(
"stat: Chart usage stat work unit is created for %s(id:%s)",
- sheet.get(tableau_constant.NAME),
- sheet.get(tableau_constant.ID),
+ sheet.get(c.NAME),
+ sheet.get(c.ID),
)
return MetadataChangeProposalWrapper(
aspect=aspect,
@@ -1992,22 +1941,20 @@ def _get_chart_stat_wu(
).as_workunit()
def emit_sheets(self) -> Iterable[MetadataWorkUnit]:
- sheets_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.sheet_ids)}"
+ sheets_filter = f"{c.ID_WITH_IN}: {json.dumps(self.sheet_ids)}"
for sheet in self.get_connection_objects(
sheet_graphql_query,
- tableau_constant.SHEETS_CONNECTION,
+ c.SHEETS_CONNECTION,
sheets_filter,
):
- yield from self.emit_sheets_as_charts(
- sheet, sheet.get(tableau_constant.WORKBOOK)
- )
+ yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK))
def emit_sheets_as_charts(
self, sheet: dict, workbook: Optional[Dict]
) -> Iterable[MetadataWorkUnit]:
sheet_urn: str = builder.make_chart_urn(
- self.platform, sheet[tableau_constant.ID], self.config.platform_instance
+ self.platform, sheet[c.ID], self.config.platform_instance
)
chart_snapshot = ChartSnapshot(
urn=sheet_urn,
@@ -2015,34 +1962,32 @@ def emit_sheets_as_charts(
)
creator: Optional[str] = None
- if workbook is not None and workbook.get(tableau_constant.OWNER) is not None:
- creator = workbook[tableau_constant.OWNER].get(tableau_constant.USERNAME)
- created_at = sheet.get(tableau_constant.CREATED_AT, datetime.now())
- updated_at = sheet.get(tableau_constant.UPDATED_AT, datetime.now())
+ if workbook is not None and workbook.get(c.OWNER) is not None:
+ creator = workbook[c.OWNER].get(c.USERNAME)
+ created_at = sheet.get(c.CREATED_AT, datetime.now())
+ updated_at = sheet.get(c.UPDATED_AT, datetime.now())
last_modified = self.get_last_modified(creator, created_at, updated_at)
- if sheet.get(tableau_constant.PATH):
+ if sheet.get(c.PATH):
site_part = f"/site/{self.config.site}" if self.config.site else ""
- sheet_external_url = f"{self.config.connect_uri}/#{site_part}/views/{sheet.get(tableau_constant.PATH)}"
- elif (
- sheet.get(tableau_constant.CONTAINED_IN_DASHBOARDS) is not None
- and len(sheet[tableau_constant.CONTAINED_IN_DASHBOARDS]) > 0
- and sheet[tableau_constant.CONTAINED_IN_DASHBOARDS][0] is not None
- and sheet[tableau_constant.CONTAINED_IN_DASHBOARDS][0].get(
- tableau_constant.PATH
+ sheet_external_url = (
+ f"{self.config.connect_uri}/#{site_part}/views/{sheet.get(c.PATH)}"
)
+ elif (
+ sheet.get(c.CONTAINED_IN_DASHBOARDS) is not None
+ and len(sheet[c.CONTAINED_IN_DASHBOARDS]) > 0
+ and sheet[c.CONTAINED_IN_DASHBOARDS][0] is not None
+ and sheet[c.CONTAINED_IN_DASHBOARDS][0].get(c.PATH)
):
# sheet contained in dashboard
site_part = f"/t/{self.config.site}" if self.config.site else ""
- dashboard_path = sheet[tableau_constant.CONTAINED_IN_DASHBOARDS][0][
- tableau_constant.PATH
- ]
- sheet_external_url = f"{self.config.connect_uri}{site_part}/authoring/{dashboard_path}/{sheet.get(tableau_constant.NAME, '')}"
+ dashboard_path = sheet[c.CONTAINED_IN_DASHBOARDS][0][c.PATH]
+ sheet_external_url = f"{self.config.connect_uri}{site_part}/authoring/{dashboard_path}/{sheet.get(c.NAME, '')}"
else:
# hidden or viz-in-tooltip sheet
sheet_external_url = None
input_fields: List[InputField] = []
- if sheet.get(tableau_constant.DATA_SOURCE_FIELDS):
+ if sheet.get(c.DATA_SOURCE_FIELDS):
self.populate_sheet_upstream_fields(sheet, input_fields)
# datasource urn
@@ -2060,15 +2005,13 @@ def emit_sheets_as_charts(
# Chart Info
chart_info = ChartInfoClass(
description="",
- title=sheet.get(tableau_constant.NAME) or "",
+ title=sheet.get(c.NAME) or "",
lastModified=last_modified,
externalUrl=sheet_external_url
if self.config.ingest_external_links_for_charts
else None,
inputs=sorted(datasource_urn),
- customProperties=self.get_custom_props_from_dict(
- sheet, [tableau_constant.LUID]
- ),
+ customProperties=self.get_custom_props_from_dict(sheet, [c.LUID]),
)
chart_snapshot.aspects.append(chart_info)
# chart_snapshot doesn't support the stat aspect as list element and hence need to emit MCP
@@ -2083,7 +2026,7 @@ def emit_sheets_as_charts(
chart_snapshot.aspects.append(browse_paths)
else:
logger.warning(
- f"Could not set browse path for workbook {sheet[tableau_constant.ID]}. Please check permissions."
+ f"Could not set browse path for workbook {sheet[c.ID]}. Please check permissions."
)
# Ownership
@@ -2107,9 +2050,7 @@ def emit_sheets_as_charts(
)
if workbook is not None:
yield from add_entity_to_container(
- self.gen_workbook_key(workbook[tableau_constant.ID]),
- tableau_constant.CHART,
- chart_snapshot.urn,
+ self.gen_workbook_key(workbook[c.ID]), c.CHART, chart_snapshot.urn
)
if input_fields:
@@ -2134,14 +2075,12 @@ def _get_project_path(self, project: TableauProject) -> str:
def populate_sheet_upstream_fields(
self, sheet: dict, input_fields: List[InputField]
) -> None:
- for field in sheet.get(tableau_constant.DATA_SOURCE_FIELDS): # type: ignore
+ for field in sheet.get(c.DATA_SOURCE_FIELDS): # type: ignore
if not field:
continue
- name = field.get(tableau_constant.NAME)
+ name = field.get(c.NAME)
upstream_ds_id = (
- field.get(tableau_constant.DATA_SOURCE)[tableau_constant.ID]
- if field.get(tableau_constant.DATA_SOURCE)
- else None
+ field.get(c.DATA_SOURCE)[c.ID] if field.get(c.DATA_SOURCE) else None
)
if name and upstream_ds_id:
input_fields.append(
@@ -2162,10 +2101,8 @@ def populate_sheet_upstream_fields(
)
def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]:
- workbook_container_key = self.gen_workbook_key(workbook[tableau_constant.ID])
- creator = workbook.get(tableau_constant.OWNER, {}).get(
- tableau_constant.USERNAME
- )
+ workbook_container_key = self.gen_workbook_key(workbook[c.ID])
+ creator = workbook.get(c.OWNER, {}).get(c.USERNAME)
owner_urn = (
builder.make_user_urn(creator)
@@ -2191,17 +2128,17 @@ def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUni
if project_luid and project_luid in self.tableau_project_registry.keys():
parent_key = self.gen_project_key(project_luid)
else:
- workbook_id: Optional[str] = workbook.get(tableau_constant.ID)
- workbook_name: Optional[str] = workbook.get(tableau_constant.NAME)
+ workbook_id: Optional[str] = workbook.get(c.ID)
+ workbook_name: Optional[str] = workbook.get(c.NAME)
logger.warning(
f"Could not load project hierarchy for workbook {workbook_name}({workbook_id}). Please check permissions."
)
yield from gen_containers(
container_key=workbook_container_key,
- name=workbook.get(tableau_constant.NAME) or "",
+ name=workbook.get(c.NAME) or "",
parent_container_key=parent_key,
- description=workbook.get(tableau_constant.DESCRIPTION),
+ description=workbook.get(c.DESCRIPTION),
sub_types=[BIContainerSubTypes.TABLEAU_WORKBOOK],
owner_urn=owner_urn,
external_url=workbook_external_url,
@@ -2237,20 +2174,20 @@ def _create_datahub_dashboard_usage_stat(
def _get_dashboard_stat_wu(
self, dashboard: dict, dashboard_urn: str
) -> Optional[MetadataWorkUnit]:
- luid: Optional[str] = dashboard.get(tableau_constant.LUID)
+ luid: Optional[str] = dashboard.get(c.LUID)
if luid is None:
logger.debug(
"stat:luid is none for dashboard %s(id:%s)",
- dashboard.get(tableau_constant.NAME),
- dashboard.get(tableau_constant.ID),
+ dashboard.get(c.NAME),
+ dashboard.get(c.ID),
)
return None
usage_stat: Optional[UsageStat] = self.tableau_stat_registry.get(luid)
if usage_stat is None:
logger.debug(
"stat:UsageStat is not available in tableau_stat_registry for dashboard %s(id:%s)",
- dashboard.get(tableau_constant.NAME),
- dashboard.get(tableau_constant.ID),
+ dashboard.get(c.NAME),
+ dashboard.get(c.ID),
)
return None
@@ -2259,8 +2196,8 @@ def _get_dashboard_stat_wu(
)
logger.debug(
"stat: Dashboard usage stat is created for %s(id:%s)",
- dashboard.get(tableau_constant.NAME),
- dashboard.get(tableau_constant.ID),
+ dashboard.get(c.NAME),
+ dashboard.get(c.ID),
)
return MetadataChangeProposalWrapper(
@@ -2288,26 +2225,20 @@ def new_work_unit(self, mcp: MetadataChangeProposalWrapper) -> MetadataWorkUnit:
)
def emit_dashboards(self) -> Iterable[MetadataWorkUnit]:
- dashboards_filter = (
- f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.dashboard_ids)}"
- )
+ dashboards_filter = f"{c.ID_WITH_IN}: {json.dumps(self.dashboard_ids)}"
for dashboard in self.get_connection_objects(
dashboard_graphql_query,
- tableau_constant.DASHBOARDS_CONNECTION,
+ c.DASHBOARDS_CONNECTION,
dashboards_filter,
):
- yield from self.emit_dashboard(
- dashboard, dashboard.get(tableau_constant.WORKBOOK)
- )
+ yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK))
def get_tags(self, obj: dict) -> Optional[List[str]]:
- tag_list = obj.get(tableau_constant.TAGS, [])
+ tag_list = obj.get(c.TAGS, [])
if tag_list and self.config.ingest_tags:
tag_list_str = [
- t[tableau_constant.NAME]
- for t in tag_list
- if t is not None and t.get(tableau_constant.NAME)
+ t[c.NAME] for t in tag_list if t is not None and t.get(c.NAME)
]
return tag_list_str
@@ -2317,7 +2248,7 @@ def emit_dashboard(
self, dashboard: dict, workbook: Optional[Dict]
) -> Iterable[MetadataWorkUnit]:
dashboard_urn: str = builder.make_dashboard_urn(
- self.platform, dashboard[tableau_constant.ID], self.config.platform_instance
+ self.platform, dashboard[c.ID], self.config.platform_instance
)
dashboard_snapshot = DashboardSnapshot(
urn=dashboard_urn,
@@ -2325,26 +2256,28 @@ def emit_dashboard(
)
creator: Optional[str] = None
- if workbook is not None and workbook.get(tableau_constant.OWNER) is not None:
- creator = workbook[tableau_constant.OWNER].get(tableau_constant.USERNAME)
- created_at = dashboard.get(tableau_constant.CREATED_AT, datetime.now())
- updated_at = dashboard.get(tableau_constant.UPDATED_AT, datetime.now())
+ if workbook is not None and workbook.get(c.OWNER) is not None:
+ creator = workbook[c.OWNER].get(c.USERNAME)
+ created_at = dashboard.get(c.CREATED_AT, datetime.now())
+ updated_at = dashboard.get(c.UPDATED_AT, datetime.now())
last_modified = self.get_last_modified(creator, created_at, updated_at)
site_part = f"/site/{self.config.site}" if self.config.site else ""
- dashboard_external_url = f"{self.config.connect_uri}/#{site_part}/views/{dashboard.get(tableau_constant.PATH, '')}"
+ dashboard_external_url = (
+ f"{self.config.connect_uri}/#{site_part}/views/{dashboard.get(c.PATH, '')}"
+ )
title = (
- dashboard[tableau_constant.NAME].replace("/", REPLACE_SLASH_CHAR)
- if dashboard.get(tableau_constant.NAME)
+ dashboard[c.NAME].replace("/", REPLACE_SLASH_CHAR)
+ if dashboard.get(c.NAME)
else ""
)
chart_urns = [
builder.make_chart_urn(
self.platform,
- sheet.get(tableau_constant.ID),
+ sheet.get(c.ID),
self.config.platform_instance,
)
- for sheet in dashboard.get(tableau_constant.SHEETS, [])
+ for sheet in dashboard.get(c.SHEETS, [])
]
dashboard_info_class = DashboardInfoClass(
description="",
@@ -2354,9 +2287,7 @@ def emit_dashboard(
dashboardUrl=dashboard_external_url
if self.config.ingest_external_links_for_dashboards
else None,
- customProperties=self.get_custom_props_from_dict(
- dashboard, [tableau_constant.LUID]
- ),
+ customProperties=self.get_custom_props_from_dict(dashboard, [c.LUID]),
)
dashboard_snapshot.aspects.append(dashboard_info_class)
@@ -2377,7 +2308,7 @@ def emit_dashboard(
dashboard_snapshot.aspects.append(browse_paths)
else:
logger.warning(
- f"Could not set browse path for dashboard {dashboard[tableau_constant.ID]}. Please check permissions."
+ f"Could not set browse path for dashboard {dashboard[c.ID]}. Please check permissions."
)
# Ownership
@@ -2397,8 +2328,8 @@ def emit_dashboard(
if workbook is not None:
yield from add_entity_to_container(
- self.gen_workbook_key(workbook[tableau_constant.ID]),
- tableau_constant.DASHBOARD,
+ self.gen_workbook_key(workbook[c.ID]),
+ c.DASHBOARD,
dashboard_snapshot.urn,
)
@@ -2406,38 +2337,40 @@ def get_browse_paths_aspect(
self, workbook: Optional[Dict]
) -> Optional[BrowsePathsClass]:
browse_paths: Optional[BrowsePathsClass] = None
- if workbook and workbook.get(tableau_constant.NAME):
+ if workbook and workbook.get(c.NAME):
project_luid: Optional[str] = self._get_workbook_project_luid(workbook)
if project_luid in self.tableau_project_registry:
browse_paths = BrowsePathsClass(
paths=[
f"/{self.platform}/{self._project_luid_to_browse_path_name(project_luid)}"
- f"/{workbook[tableau_constant.NAME].replace('/', REPLACE_SLASH_CHAR)}"
+ f"/{workbook[c.NAME].replace('/', REPLACE_SLASH_CHAR)}"
]
)
- elif workbook.get(tableau_constant.PROJECT_NAME):
+ elif workbook.get(c.PROJECT_NAME):
# browse path
browse_paths = BrowsePathsClass(
paths=[
- f"/{self.platform}/{workbook[tableau_constant.PROJECT_NAME].replace('/', REPLACE_SLASH_CHAR)}"
- f"/{workbook[tableau_constant.NAME].replace('/', REPLACE_SLASH_CHAR)}"
+ f"/{self.platform}/{workbook[c.PROJECT_NAME].replace('/', REPLACE_SLASH_CHAR)}"
+ f"/{workbook[c.NAME].replace('/', REPLACE_SLASH_CHAR)}"
]
)
return browse_paths
def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]:
- datasource_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.embedded_datasource_ids_being_used)}"
+ datasource_filter = (
+ f"{c.ID_WITH_IN}: {json.dumps(self.embedded_datasource_ids_being_used)}"
+ )
for datasource in self.get_connection_objects(
embedded_datasource_graphql_query,
- tableau_constant.EMBEDDED_DATA_SOURCES_CONNECTION,
+ c.EMBEDDED_DATA_SOURCES_CONNECTION,
datasource_filter,
):
yield from self.emit_datasource(
datasource,
- datasource.get(tableau_constant.WORKBOOK),
+ datasource.get(c.WORKBOOK),
is_embedded_ds=True,
)
@@ -2483,7 +2416,7 @@ def emit_project_containers(self) -> Iterable[MetadataWorkUnit]:
container_key=self.gen_project_key(_id),
name=project.name,
description=project.description,
- sub_types=[tableau_constant.PROJECT],
+ sub_types=[c.PROJECT],
parent_container_key=self.gen_project_key(project.parent_id)
if project.parent_id
else None,
@@ -2498,7 +2431,7 @@ def emit_project_containers(self) -> Iterable[MetadataWorkUnit]:
yield from gen_containers(
container_key=self.gen_project_key(project.parent_id),
name=cast(str, project.parent_name),
- sub_types=[tableau_constant.PROJECT],
+ sub_types=[c.PROJECT],
)
def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]:
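Across the tableau.py hunks above, the emission path itself is unchanged: build an aspect, wrap it in a MetadataChangeProposalWrapper, and yield it as a work unit; only the constant references got shorter. A minimal sketch of that pattern follows, with placeholder urns and with the literal entity/aspect names assumed to be what `c.DATASET` and `c.UPSTREAM_LINEAGE` resolve to; it is an illustration, not the module's exact helper.

```python
# Sketch of the MCP-emission pattern used above (placeholder urns; the literal
# entity/aspect names are assumed equivalents of c.DATASET / c.UPSTREAM_LINEAGE).
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
    DatasetLineageType,
    Upstream,
    UpstreamLineage,
)
from datahub.metadata.schema_classes import ChangeTypeClass

csql_urn = "urn:li:dataset:(urn:li:dataPlatform:tableau,example-custom-sql,PROD)"
upstream_urn = "urn:li:dataset:(urn:li:dataPlatform:snowflake,db.schema.table,PROD)"

upstream_lineage = UpstreamLineage(
    upstreams=[Upstream(dataset=upstream_urn, type=DatasetLineageType.TRANSFORMED)]
)

mcp = MetadataChangeProposalWrapper(
    entityType="dataset",          # c.DATASET in the source
    changeType=ChangeTypeClass.UPSERT,
    entityUrn=csql_urn,
    aspectName="upstreamLineage",  # c.UPSTREAM_LINEAGE in the source
    aspect=upstream_lineage,
)
work_unit = mcp.as_workunit()  # a MetadataWorkUnit, as yielded by emit_datasource
```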
diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py
index 7c4852042ce7c8..65d779b7f4516d 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py
@@ -8,7 +8,7 @@
import datahub.emitter.mce_builder as builder
from datahub.configuration.common import ConfigModel
-from datahub.ingestion.source import tableau_constant as tc
+from datahub.ingestion.source import tableau_constant as c
from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
DatasetLineageType,
FineGrainedLineage,
@@ -591,12 +591,12 @@ def create(
cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None
) -> "TableauUpstreamReference":
# Values directly from `table` object from Tableau
- database = t_database = d.get(tc.DATABASE, {}).get(tc.NAME)
- schema = t_schema = d.get(tc.SCHEMA)
- table = t_table = d.get(tc.NAME) or ""
- t_full_name = d.get(tc.FULL_NAME)
- t_connection_type = d[tc.CONNECTION_TYPE] # required to generate urn
- t_id = d[tc.ID]
+ database = t_database = d.get(c.DATABASE, {}).get(c.NAME)
+ schema = t_schema = d.get(c.SCHEMA)
+ table = t_table = d.get(c.NAME) or ""
+ t_full_name = d.get(c.FULL_NAME)
+ t_connection_type = d[c.CONNECTION_TYPE] # required to generate urn
+ t_id = d[c.ID]
parsed_full_name = cls.parse_full_name(t_full_name)
if parsed_full_name and len(parsed_full_name) == 3:
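This tableau_common.py hunk renames the `tableau_constant` alias from `tc` to `c`, so both modules now share the same shorthand. The pattern itself is plain attribute access on a constants module; the self-contained sketch below stands in for it, with the literal key values assumed since tableau_constant.py is not shown in this patch.

```python
# Stand-in for the aliasing pattern; the key values here are assumptions, since
# tableau_constant.py itself is not part of this patch.
from types import SimpleNamespace

c = SimpleNamespace(TYPE_NAME="__typename", NAME="name", ID="id")

def describe_datasource(ds: dict) -> str:
    """Mirror the ds.get(c.X) access pattern used on Tableau GraphQL dicts."""
    return f"{ds.get(c.TYPE_NAME)} {ds.get(c.NAME)} (id={ds.get(c.ID)})"

print(describe_datasource({"__typename": "PublishedDatasource", "name": "Sales", "id": "abc-123"}))
```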
From 9174301719122c2597db75c8bb6b60c4d1a74f77 Mon Sep 17 00:00:00 2001
From: sachinsaju <33017477+sachinsaju@users.noreply.github.com>
Date: Thu, 9 Nov 2023 10:37:09 +0530
Subject: [PATCH 32/33] docs: update broken link in metadata-modelling (#9184)
Co-authored-by: Hyejin Yoon <0327jane@gmail.com>
Co-authored-by: John Joyce
---
docs/modeling/metadata-model.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md
index a8958985a0a724..4c97cadc88417e 100644
--- a/docs/modeling/metadata-model.md
+++ b/docs/modeling/metadata-model.md
@@ -625,7 +625,7 @@ curl --location --request POST 'http://localhost:8080/analytics?action=getTimese
}
}
```
-For more examples on the complex types of group-by/aggregations, refer to the tests in the group `getAggregatedStats` of [ElasticSearchTimeseriesAspectServiceTest.java](https://github.com/datahub-project/datahub/blob/master/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java).
+For more examples on the complex types of group-by/aggregations, refer to the tests in the group `getAggregatedStats` of [TimeseriesAspectServiceTestBase.java](https://github.com/datahub-project/datahub/blob/master/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java).
From e494a9cc102f863bc51fcf80674bd6d3d36d726c Mon Sep 17 00:00:00 2001
From: Kos Korchak <97058061+kkorchak@users.noreply.github.com>
Date: Thu, 9 Nov 2023 00:23:17 -0500
Subject: [PATCH 33/33] test(): Test policy to create and manage privileges
(#9173)
---
.../tests/privileges/test_privileges.py | 112 +++++++++++++++++-
1 file changed, 111 insertions(+), 1 deletion(-)
diff --git a/smoke-test/tests/privileges/test_privileges.py b/smoke-test/tests/privileges/test_privileges.py
index 740311754678ef..d0f00734ae9f37 100644
--- a/smoke-test/tests/privileges/test_privileges.py
+++ b/smoke-test/tests/privileges/test_privileges.py
@@ -114,6 +114,21 @@ def _ensure_can_create_access_token(session, json):
assert ingestion_data["data"]["createAccessToken"]["__typename"] == "AccessToken"
+@tenacity.retry(
+ stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec)
+)
+def _ensure_can_create_user_policy(session, json):
+ response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json)
+ response.raise_for_status()
+ res_data = response.json()
+
+ assert res_data
+ assert res_data["data"]
+ assert res_data["data"]["createPolicy"] is not None
+
+ return res_data["data"]["createPolicy"]
+
+
@pytest.mark.dependency(depends=["test_healthchecks"])
def test_privilege_to_create_and_manage_secrets():
@@ -337,4 +352,99 @@ def test_privilege_to_create_and_manage_access_tokens():
# Ensure that user can't create access token after policy is removed
- _ensure_cant_perform_action(user_session, create_access_token,"createAccessToken")
\ No newline at end of file
+    _ensure_cant_perform_action(user_session, create_access_token, "createAccessToken")
+
+
+@pytest.mark.dependency(depends=["test_healthchecks"])
+def test_privilege_to_create_and_manage_policies():
+
+ (admin_user, admin_pass) = get_admin_credentials()
+ admin_session = login_as(admin_user, admin_pass)
+ user_session = login_as("user", "user")
+
+
+ # Verify new user can't create a policy
+ create_policy = {
+ "query": """mutation createPolicy($input: PolicyUpdateInput!) {\n
+ createPolicy(input: $input) }""",
+ "variables": {
+ "input": {
+ "type": "PLATFORM",
+ "name": "Policy Name",
+ "description": "Policy Description",
+ "state": "ACTIVE",
+ "resources": {"filter":{"criteria":[]}},
+ "privileges": ["MANAGE_POLICIES"],
+ "actors": {
+ "users": [],
+ "resourceOwners": False,
+ "allUsers": True,
+ "allGroups": False,
+ },
+ }
+ },
+ }
+
+    _ensure_cant_perform_action(user_session, create_policy, "createPolicy")
+
+
+ # Assign privileges to the new user to create and manage policies
+ admin_policy_urn = create_user_policy("urn:li:corpuser:user", ["MANAGE_POLICIES"], admin_session)
+
+
+    # Verify new user can create and manage a policy (create, edit, delete)
+ # Create a policy
+ user_policy_urn = _ensure_can_create_user_policy(user_session, create_policy)
+
+ # Edit a policy
+ edit_policy = {
+ "query": """mutation updatePolicy($urn: String!, $input: PolicyUpdateInput!) {\n
+ updatePolicy(urn: $urn, input: $input) }""",
+ "variables": {
+ "urn": user_policy_urn,
+ "input": {
+ "type": "PLATFORM",
+ "state": "INACTIVE",
+ "name": "Policy Name test",
+ "description": "Policy Description updated",
+ "privileges": ["MANAGE_POLICIES"],
+ "actors": {
+ "users": [],
+ "groups": None,
+ "resourceOwners": False,
+ "allUsers": True,
+ "allGroups": False,
+ "resourceOwnersTypes": None,
+ },
+ },
+ },
+ }
+ edit_policy_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=edit_policy)
+ edit_policy_response.raise_for_status()
+ res_data = edit_policy_response.json()
+
+ assert res_data
+ assert res_data["data"]
+ assert res_data["data"]["updatePolicy"] == user_policy_urn
+
+ # Delete a policy
+ remove_user_policy = {
+ "query": "mutation deletePolicy($urn: String!) {\n deletePolicy(urn: $urn)\n}\n",
+ "variables":{"urn":user_policy_urn}
+ }
+
+ remove_policy_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=remove_user_policy)
+ remove_policy_response.raise_for_status()
+ res_data = remove_policy_response.json()
+
+ assert res_data
+ assert res_data["data"]
+ assert res_data["data"]["deletePolicy"] == user_policy_urn
+
+
+    # Remove the user's privilege as admin
+ remove_policy(admin_policy_urn, admin_session)
+
+
+ # Ensure that user can't create a policy after privilege is removed by admin
+    _ensure_cant_perform_action(user_session, create_policy, "createPolicy")
\ No newline at end of file
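The new `_ensure_can_create_user_policy` helper follows the suite's existing shape: retry a GraphQL mutation with tenacity until the assertions on its response pass. A hypothetical generalization of that shape is sketched below; it is not part of the suite, and `FRONTEND_URL`/`SLEEP_SEC` stand in for the suite's `get_frontend_url()` and `sleep_sec`.

```python
# Hypothetical generalization (not in the suite) of the retry-wrapped GraphQL
# assertion pattern used by the policy test above.
import requests
import tenacity

FRONTEND_URL = "http://localhost:9002"  # assumption: local quickstart frontend
SLEEP_SEC = 5


@tenacity.retry(stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(SLEEP_SEC))
def ensure_mutation_result(session: requests.Session, json_body: dict, field: str):
    """POST a GraphQL mutation and assert the named result field is non-null."""
    response = session.post(f"{FRONTEND_URL}/api/v2/graphql", json=json_body)
    response.raise_for_status()
    res_data = response.json()

    assert res_data and res_data.get("data")
    assert res_data["data"].get(field) is not None
    return res_data["data"][field]
```

Used as `ensure_mutation_result(user_session, create_policy, "createPolicy")`, it would return the new policy urn the same way `_ensure_can_create_user_policy` does above.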