diff --git a/.github/workflows/docker-ingestion-smoke.yml b/.github/workflows/docker-ingestion-smoke.yml index 8d52c237928577..82b57d23609a56 100644 --- a/.github/workflows/docker-ingestion-smoke.yml +++ b/.github/workflows/docker-ingestion-smoke.yml @@ -47,6 +47,7 @@ jobs: name: Build and Push Docker Image to Docker Hub runs-on: ubuntu-latest needs: setup + if: ${{ needs.setup.outputs.publish == 'true' }} steps: - name: Check out the repo uses: actions/checkout@v3 diff --git a/build.gradle b/build.gradle index aa2e9594447e4c..54802917d05a52 100644 --- a/build.gradle +++ b/build.gradle @@ -1,7 +1,7 @@ buildscript { ext.junitJupiterVersion = '5.6.1' // Releases: https://github.com/linkedin/rest.li/blob/master/CHANGELOG.md - ext.pegasusVersion = '29.45.0' + ext.pegasusVersion = '29.46.8' ext.mavenVersion = '3.6.3' ext.springVersion = '5.3.29' ext.springBootVersion = '2.7.14' @@ -216,7 +216,7 @@ project.ext.externalDependency = [ 'testContainersOpenSearch': 'org.opensearch:opensearch-testcontainers:2.0.0', 'typesafeConfig':'com.typesafe:config:1.4.1', 'wiremock':'com.github.tomakehurst:wiremock:2.10.0', - 'zookeeper': 'org.apache.zookeeper:zookeeper:3.4.14', + 'zookeeper': 'org.apache.zookeeper:zookeeper:3.7.2', 'wire': 'com.squareup.wire:wire-compiler:3.7.1', 'charle': 'com.charleskorn.kaml:kaml:0.53.0', 'common': 'commons-io:commons-io:2.7', diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java index b99f712034fe03..b0b26f073876c4 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java @@ -1433,6 +1433,10 @@ private void configureChartResolvers(final RuntimeWiring.Builder builder) { .dataFetcher("statsSummary", new ChartStatsSummaryResolver(this.timeseriesAspectService)) .dataFetcher("privileges", new EntityPrivilegesResolver(entityClient)) .dataFetcher("exists", new EntityExistsResolver(entityService)) + .dataFetcher("subTypes", new SubTypesResolver( + this.entityClient, + "chart", + "subTypes")) ); builder.type("ChartInfo", typeWiring -> typeWiring .dataFetcher("inputs", new LoadableTypeBatchResolver<>(datasetType, diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/ListPoliciesResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/ListPoliciesResolver.java index 516d6fa2d31372..b44da1c2f832c6 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/ListPoliciesResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/ListPoliciesResolver.java @@ -40,23 +40,15 @@ public CompletableFuture get(final DataFetchingEnvironment e final Integer count = input.getCount() == null ? DEFAULT_COUNT : input.getCount(); final String query = input.getQuery() == null ? DEFAULT_QUERY : input.getQuery(); - return CompletableFuture.supplyAsync(() -> { - try { - // First, get all policy Urns. - final PolicyFetcher.PolicyFetchResult policyFetchResult = - _policyFetcher.fetchPolicies(start, count, query, context.getAuthentication()); - - // Now that we have entities we can bind this to a result. - final ListPoliciesResult result = new ListPoliciesResult(); - result.setStart(start); - result.setCount(count); - result.setTotal(policyFetchResult.getTotal()); - result.setPolicies(mapEntities(policyFetchResult.getPolicies())); - return result; - } catch (Exception e) { - throw new RuntimeException("Failed to list policies", e); - } - }); + return _policyFetcher.fetchPolicies(start, query, count, context.getAuthentication()) + .thenApply(policyFetchResult -> { + final ListPoliciesResult result = new ListPoliciesResult(); + result.setStart(start); + result.setCount(count); + result.setTotal(policyFetchResult.getTotal()); + result.setPolicies(mapEntities(policyFetchResult.getPolicies())); + return result; + }); } throw new AuthorizationException("Unauthorized to perform this action. Please contact your DataHub administrator."); } diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index b37a8f34fa0563..035f756a10d557 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -5249,6 +5249,11 @@ type Chart implements EntityWithRelationships & Entity & BrowsableEntity { Whether or not this entity exists on DataHub """ exists: Boolean + + """ + Sub Types that this entity implements + """ + subTypes: SubTypes } """ diff --git a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/entity/steps/BackfillBrowsePathsV2Step.java b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/entity/steps/BackfillBrowsePathsV2Step.java index 7547186ccfb230..08a752d9597f42 100644 --- a/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/entity/steps/BackfillBrowsePathsV2Step.java +++ b/datahub-upgrade/src/main/java/com/linkedin/datahub/upgrade/system/entity/steps/BackfillBrowsePathsV2Step.java @@ -6,6 +6,7 @@ import com.linkedin.common.BrowsePathsV2; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.template.StringArray; import com.linkedin.datahub.upgrade.UpgradeContext; import com.linkedin.datahub.upgrade.UpgradeStep; import com.linkedin.datahub.upgrade.UpgradeStepResult; @@ -13,6 +14,7 @@ import com.linkedin.events.metadata.ChangeType; import com.linkedin.metadata.Constants; import com.linkedin.metadata.entity.EntityService; +import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.ConjunctiveCriterion; import com.linkedin.metadata.query.filter.ConjunctiveCriterionArray; @@ -37,6 +39,8 @@ public class BackfillBrowsePathsV2Step implements UpgradeStep { public static final String BACKFILL_BROWSE_PATHS_V2 = "BACKFILL_BROWSE_PATHS_V2"; + public static final String REPROCESS_DEFAULT_BROWSE_PATHS_V2 = "REPROCESS_DEFAULT_BROWSE_PATHS_V2"; + public static final String DEFAULT_BROWSE_PATH_V2 = "␟Default"; private static final Set ENTITY_TYPES_TO_MIGRATE = ImmutableSet.of( Constants.DATASET_ENTITY_NAME, @@ -81,27 +85,14 @@ public Function executable() { private String backfillBrowsePathsV2(String entityType, AuditStamp auditStamp, String scrollId) { - // Condition: has `browsePaths` AND does NOT have `browsePathV2` - Criterion missingBrowsePathV2 = new Criterion(); - missingBrowsePathV2.setCondition(Condition.IS_NULL); - missingBrowsePathV2.setField("browsePathV2"); - // Excludes entities without browsePaths - Criterion hasBrowsePathV1 = new Criterion(); - hasBrowsePathV1.setCondition(Condition.EXISTS); - hasBrowsePathV1.setField("browsePaths"); - - CriterionArray criterionArray = new CriterionArray(); - criterionArray.add(missingBrowsePathV2); - criterionArray.add(hasBrowsePathV1); - - ConjunctiveCriterion conjunctiveCriterion = new ConjunctiveCriterion(); - conjunctiveCriterion.setAnd(criterionArray); + final Filter filter; - ConjunctiveCriterionArray conjunctiveCriterionArray = new ConjunctiveCriterionArray(); - conjunctiveCriterionArray.add(conjunctiveCriterion); - - Filter filter = new Filter(); - filter.setOr(conjunctiveCriterionArray); + if (System.getenv().containsKey(REPROCESS_DEFAULT_BROWSE_PATHS_V2) + && Boolean.parseBoolean(System.getenv(REPROCESS_DEFAULT_BROWSE_PATHS_V2))) { + filter = backfillDefaultBrowsePathsV2Filter(); + } else { + filter = backfillBrowsePathsV2Filter(); + } final ScrollResult scrollResult = _searchService.scrollAcrossEntities( ImmutableList.of(entityType), @@ -109,9 +100,9 @@ private String backfillBrowsePathsV2(String entityType, AuditStamp auditStamp, S filter, null, scrollId, - "5m", + null, BATCH_SIZE, - null + new SearchFlags().setFulltext(true).setSkipCache(true).setSkipHighlighting(true).setSkipAggregates(true) ); if (scrollResult.getNumEntities() == 0 || scrollResult.getEntities().size() == 0) { return null; @@ -129,6 +120,55 @@ private String backfillBrowsePathsV2(String entityType, AuditStamp auditStamp, S return scrollResult.getScrollId(); } + private Filter backfillBrowsePathsV2Filter() { + // Condition: has `browsePaths` AND does NOT have `browsePathV2` + Criterion missingBrowsePathV2 = new Criterion(); + missingBrowsePathV2.setCondition(Condition.IS_NULL); + missingBrowsePathV2.setField("browsePathV2"); + // Excludes entities without browsePaths + Criterion hasBrowsePathV1 = new Criterion(); + hasBrowsePathV1.setCondition(Condition.EXISTS); + hasBrowsePathV1.setField("browsePaths"); + + CriterionArray criterionArray = new CriterionArray(); + criterionArray.add(missingBrowsePathV2); + criterionArray.add(hasBrowsePathV1); + + ConjunctiveCriterion conjunctiveCriterion = new ConjunctiveCriterion(); + conjunctiveCriterion.setAnd(criterionArray); + + ConjunctiveCriterionArray conjunctiveCriterionArray = new ConjunctiveCriterionArray(); + conjunctiveCriterionArray.add(conjunctiveCriterion); + + Filter filter = new Filter(); + filter.setOr(conjunctiveCriterionArray); + return filter; + } + + private Filter backfillDefaultBrowsePathsV2Filter() { + // Condition: has default `browsePathV2` + Criterion hasDefaultBrowsePathV2 = new Criterion(); + hasDefaultBrowsePathV2.setCondition(Condition.EQUAL); + hasDefaultBrowsePathV2.setField("browsePathV2"); + StringArray values = new StringArray(); + values.add(DEFAULT_BROWSE_PATH_V2); + hasDefaultBrowsePathV2.setValues(values); + hasDefaultBrowsePathV2.setValue(DEFAULT_BROWSE_PATH_V2); // not used, but required field? + + CriterionArray criterionArray = new CriterionArray(); + criterionArray.add(hasDefaultBrowsePathV2); + + ConjunctiveCriterion conjunctiveCriterion = new ConjunctiveCriterion(); + conjunctiveCriterion.setAnd(criterionArray); + + ConjunctiveCriterionArray conjunctiveCriterionArray = new ConjunctiveCriterionArray(); + conjunctiveCriterionArray.add(conjunctiveCriterion); + + Filter filter = new Filter(); + filter.setOr(conjunctiveCriterionArray); + return filter; + } + private void ingestBrowsePathsV2(Urn urn, AuditStamp auditStamp) throws Exception { BrowsePathsV2 browsePathsV2 = _entityService.buildDefaultBrowsePathV2(urn, true); log.debug(String.format("Adding browse path v2 for urn %s with value %s", urn, browsePathsV2)); @@ -142,7 +182,7 @@ private void ingestBrowsePathsV2(Urn urn, AuditStamp auditStamp) throws Exceptio _entityService.ingestProposal( proposal, auditStamp, - false + true ); } diff --git a/datahub-web-react/codegen.yml b/datahub-web-react/codegen.yml index 96a2bd61379205..35728e8aeb7d49 100644 --- a/datahub-web-react/codegen.yml +++ b/datahub-web-react/codegen.yml @@ -20,6 +20,9 @@ generates: src/types.generated.ts: plugins: - 'typescript' + src/possibleTypes.generated.ts: + plugins: + - 'fragment-matcher' src/: preset: near-operation-file presetConfig: diff --git a/datahub-web-react/package.json b/datahub-web-react/package.json index 2d9329919fdc1b..019295f3e6ffeb 100644 --- a/datahub-web-react/package.json +++ b/datahub-web-react/package.json @@ -11,6 +11,7 @@ "@apollo/client": "^3.3.19", "@craco/craco": "^6.1.1", "@data-ui/xy-chart": "^0.0.84", + "@graphql-codegen/fragment-matcher": "^5.0.0", "@miragejs/graphql": "^0.1.11", "@monaco-editor/react": "^4.3.1", "@react-hook/window-size": "^3.0.7", diff --git a/datahub-web-react/src/App.tsx b/datahub-web-react/src/App.tsx index b6bc608dccbbb0..342a89f350429f 100644 --- a/datahub-web-react/src/App.tsx +++ b/datahub-web-react/src/App.tsx @@ -36,6 +36,7 @@ import { DataPlatformEntity } from './app/entity/dataPlatform/DataPlatformEntity import { DataProductEntity } from './app/entity/dataProduct/DataProductEntity'; import { DataPlatformInstanceEntity } from './app/entity/dataPlatformInstance/DataPlatformInstanceEntity'; import { RoleEntity } from './app/entity/Access/RoleEntity'; +import possibleTypesResult from './possibleTypes.generated'; /* Construct Apollo Client @@ -77,6 +78,8 @@ const client = new ApolloClient({ }, }, }, + // need to define possibleTypes to allow us to use Apollo cache with union types + possibleTypes: possibleTypesResult.possibleTypes, }), credentials: 'include', defaultOptions: { diff --git a/datahub-web-react/src/app/entity/EntityPage.tsx b/datahub-web-react/src/app/entity/EntityPage.tsx index 09233dbd89f694..916fa417954126 100644 --- a/datahub-web-react/src/app/entity/EntityPage.tsx +++ b/datahub-web-react/src/app/entity/EntityPage.tsx @@ -8,7 +8,6 @@ import { useEntityRegistry } from '../useEntityRegistry'; import analytics, { EventType } from '../analytics'; import { decodeUrn } from './shared/utils'; import { useGetGrantedPrivilegesQuery } from '../../graphql/policy.generated'; -import { Message } from '../shared/Message'; import { UnauthorizedPage } from '../authorization/UnauthorizedPage'; import { ErrorSection } from '../shared/error/ErrorSection'; import { VIEW_ENTITY_PAGE } from './shared/constants'; @@ -34,7 +33,7 @@ export const EntityPage = ({ entityType }: Props) => { const isLineageSupported = entity.isLineageEnabled(); const isLineageMode = useIsLineageMode(); const authenticatedUserUrn = useUserContext()?.user?.urn; - const { loading, error, data } = useGetGrantedPrivilegesQuery({ + const { error, data } = useGetGrantedPrivilegesQuery({ variables: { input: { actorUrn: authenticatedUserUrn as string, @@ -71,7 +70,6 @@ export const EntityPage = ({ entityType }: Props) => { return ( <> - {loading && } {error && } {data && !canViewEntityPage && } {canViewEntityPage && diff --git a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx index 0f1b6dbf3d660d..fc898dec9d93af 100644 --- a/datahub-web-react/src/app/entity/chart/ChartEntity.tsx +++ b/datahub-web-react/src/app/entity/chart/ChartEntity.tsx @@ -154,10 +154,12 @@ export class ChartEntity implements Entity { getOverridePropertiesFromEntity = (chart?: Chart | null): GenericEntityProperties => { // TODO: Get rid of this once we have correctly formed platform coming back. const name = chart?.properties?.name; + const subTypes = chart?.subTypes; const externalUrl = chart?.properties?.externalUrl; return { name, externalUrl, + entityTypeOverride: subTypes ? capitalizeFirstLetterOnly(subTypes.typeNames?.[0]) : '', }; }; @@ -187,6 +189,7 @@ export class ChartEntity implements Entity { return ( { type: EntityType.Chart, icon: entity?.platform?.properties?.logoUrl || undefined, platform: entity?.platform, + subtype: entity?.subTypes?.typeNames?.[0] || undefined, }; }; diff --git a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx index 7d0fc143043e29..b7fbd63ee231e3 100644 --- a/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx +++ b/datahub-web-react/src/app/entity/chart/preview/ChartPreview.tsx @@ -15,6 +15,7 @@ import { EntityPath, } from '../../../../types.generated'; import DefaultPreviewCard from '../../../preview/DefaultPreviewCard'; +import { capitalizeFirstLetterOnly } from '../../../shared/textUtil'; import { useEntityRegistry } from '../../../useEntityRegistry'; import { IconStyleType } from '../../Entity'; import { ChartStatsSummary as ChartStatsSummaryView } from '../shared/ChartStatsSummary'; @@ -43,6 +44,7 @@ export const ChartPreview = ({ snippet, degree, paths, + subType, }: { urn: string; platform?: string; @@ -67,6 +69,7 @@ export const ChartPreview = ({ snippet?: React.ReactNode | null; degree?: number; paths?: EntityPath[]; + subType?: string | null; }): JSX.Element => { const entityRegistry = useEntityRegistry(); @@ -76,7 +79,7 @@ export const ChartPreview = ({ name={name || ''} urn={urn} description={description || ''} - type="Chart" + type={capitalizeFirstLetterOnly(subType) || 'Chart'} typeIcon={entityRegistry.getIcon(EntityType.Chart, 14, IconStyleType.ACCENT)} logoUrl={logoUrl || ''} platform={platform} diff --git a/datahub-web-react/src/app/entity/dashboard/shared/DashboardStatsSummary.tsx b/datahub-web-react/src/app/entity/dashboard/shared/DashboardStatsSummary.tsx index e8fb4c16aca9c6..fb6364cffac8b6 100644 --- a/datahub-web-react/src/app/entity/dashboard/shared/DashboardStatsSummary.tsx +++ b/datahub-web-react/src/app/entity/dashboard/shared/DashboardStatsSummary.tsx @@ -11,6 +11,9 @@ import ExpandingStat from '../../dataset/shared/ExpandingStat'; const StatText = styled.span` color: ${ANTD_GRAY[8]}; + @media (min-width: 1024px) { + width: 100%; + white-space: nowrap; `; const HelpIcon = styled(QuestionCircleOutlined)` diff --git a/datahub-web-react/src/app/entity/dataset/shared/DatasetStatsSummary.tsx b/datahub-web-react/src/app/entity/dataset/shared/DatasetStatsSummary.tsx index 14f550de25be76..3dcd41a3f8a412 100644 --- a/datahub-web-react/src/app/entity/dataset/shared/DatasetStatsSummary.tsx +++ b/datahub-web-react/src/app/entity/dataset/shared/DatasetStatsSummary.tsx @@ -12,6 +12,9 @@ import ExpandingStat from './ExpandingStat'; const StatText = styled.span<{ color: string }>` color: ${(props) => props.color}; + @media (min-width: 1160px) { + width: 100%; + white-space: nowrap; `; const PopoverContent = styled.div` diff --git a/datahub-web-react/src/app/entity/dataset/shared/ExpandingStat.tsx b/datahub-web-react/src/app/entity/dataset/shared/ExpandingStat.tsx index 8101a696bf274e..4e223b6e540588 100644 --- a/datahub-web-react/src/app/entity/dataset/shared/ExpandingStat.tsx +++ b/datahub-web-react/src/app/entity/dataset/shared/ExpandingStat.tsx @@ -2,9 +2,7 @@ import React, { ReactNode, useEffect, useRef, useState } from 'react'; import styled from 'styled-components'; const ExpandingStatContainer = styled.span<{ disabled: boolean; expanded: boolean; width: string }>` - overflow: hidden; - white-space: nowrap; - width: ${(props) => props.width}; + max-width: 100%; transition: width 250ms ease; `; @@ -13,6 +11,7 @@ const ExpandingStat = ({ render, }: { disabled?: boolean; + render: (isExpanded: boolean) => ReactNode; }) => { const contentRef = useRef(null); diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx index 5384eb94429ed4..74c127cb05dd9c 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/EntityProfile.tsx @@ -4,7 +4,6 @@ import { MutationHookOptions, MutationTuple, QueryHookOptions, QueryResult } fro import styled from 'styled-components/macro'; import { useHistory } from 'react-router'; import { EntityType, Exact } from '../../../../../types.generated'; -import { Message } from '../../../../shared/Message'; import { getEntityPath, getOnboardingStepIdsForEntityType, @@ -274,7 +273,6 @@ export const EntityProfile = ({ }} > <> - {loading && } {(error && ) || (!loading && ( @@ -323,7 +321,6 @@ export const EntityProfile = ({ banner /> )} - {loading && } {(error && ) || ( {isLineageMode ? ( diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx index 97595a515b34d5..69389f5dcf6fc0 100644 --- a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeader.tsx @@ -16,6 +16,7 @@ import ShareButton from '../../../../../shared/share/ShareButton'; import { capitalizeFirstLetterOnly } from '../../../../../shared/textUtil'; import { useUserContext } from '../../../../../context/useUserContext'; import { useEntityRegistry } from '../../../../../useEntityRegistry'; +import EntityHeaderLoadingSection from './EntityHeaderLoadingSection'; const TitleWrapper = styled.div` display: flex; @@ -81,7 +82,7 @@ type Props = { }; export const EntityHeader = ({ headerDropdownItems, headerActionItems, isNameEditable, subHeader }: Props) => { - const { urn, entityType, entityData } = useEntityData(); + const { urn, entityType, entityData, loading } = useEntityData(); const refetch = useRefetch(); const me = useUserContext(); const platformName = getPlatformName(entityData); @@ -99,25 +100,32 @@ export const EntityHeader = ({ headerDropdownItems, headerActionItems, isNameEdi <> - - - - {entityData?.deprecation?.deprecated && ( - - )} - {entityData?.health && ( - ) || ( + <> + + + + {entityData?.deprecation?.deprecated && ( + + )} + {entityData?.health && ( + + )} + + - )} - - + + )} diff --git a/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeaderLoadingSection.tsx b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeaderLoadingSection.tsx new file mode 100644 index 00000000000000..bbf813804edd43 --- /dev/null +++ b/datahub-web-react/src/app/entity/shared/containers/profile/header/EntityHeaderLoadingSection.tsx @@ -0,0 +1,29 @@ +import * as React from 'react'; +import { Skeleton, Space } from 'antd'; +import styled from 'styled-components'; +import { ANTD_GRAY } from '../../../constants'; + +const ContextSkeleton = styled(Skeleton.Input)` + && { + width: 320px; + border-radius: 4px; + background-color: ${ANTD_GRAY[3]}; + } +`; + +const NameSkeleton = styled(Skeleton.Input)` + && { + width: 240px; + border-radius: 4px; + background-color: ${ANTD_GRAY[3]}; + } +`; + +export default function EntityHeaderLoadingSection() { + return ( + + + + + ); +} diff --git a/datahub-web-react/src/app/home/HomePageHeader.tsx b/datahub-web-react/src/app/home/HomePageHeader.tsx index e5c01252a865b6..c881109f6e419d 100644 --- a/datahub-web-react/src/app/home/HomePageHeader.tsx +++ b/datahub-web-react/src/app/home/HomePageHeader.tsx @@ -1,6 +1,7 @@ import React, { useEffect, useMemo, useState } from 'react'; import { useHistory } from 'react-router'; import { Typography, Image, Row, Button, Tag } from 'antd'; +import { debounce } from 'lodash'; import styled, { useTheme } from 'styled-components/macro'; import { RightOutlined } from '@ant-design/icons'; import { ManageAccount } from '../shared/ManageAccount'; @@ -24,6 +25,7 @@ import { getAutoCompleteInputFromQuickFilter } from '../search/utils/filterUtils import { useUserContext } from '../context/useUserContext'; import AcrylDemoBanner from './AcrylDemoBanner'; import DemoButton from '../entity/shared/components/styled/DemoButton'; +import { HALF_SECOND_IN_MS } from '../entity/shared/tabs/Dataset/Queries/utils/constants'; const Background = styled.div` width: 100%; @@ -176,7 +178,7 @@ export const HomePageHeader = () => { }); }; - const onAutoComplete = (query: string) => { + const onAutoComplete = debounce((query: string) => { if (query && query.trim() !== '') { getAutoCompleteResultsForMultiple({ variables: { @@ -189,7 +191,7 @@ export const HomePageHeader = () => { }, }); } - }; + }, HALF_SECOND_IN_MS); const onClickExploreAll = () => { analytics.event({ @@ -276,6 +278,7 @@ export const HomePageHeader = () => { combineSiblings showQuickFilters showViewAllResults + showCommandK /> {searchResultsToShow && searchResultsToShow.length > 0 && ( diff --git a/datahub-web-react/src/app/ingest/source/builder/constants.ts b/datahub-web-react/src/app/ingest/source/builder/constants.ts index dba8e8bb1dce6b..fdb094d721304b 100644 --- a/datahub-web-react/src/app/ingest/source/builder/constants.ts +++ b/datahub-web-react/src/app/ingest/source/builder/constants.ts @@ -29,6 +29,7 @@ import databricksLogo from '../../../../images/databrickslogo.png'; import verticaLogo from '../../../../images/verticalogo.png'; import mlflowLogo from '../../../../images/mlflowlogo.png'; import dynamodbLogo from '../../../../images/dynamodblogo.png'; +import fivetranLogo from '../../../../images/fivetranlogo.png'; export const ATHENA = 'athena'; export const ATHENA_URN = `urn:li:dataPlatform:${ATHENA}`; @@ -105,6 +106,8 @@ export const DBT_CLOUD = 'dbt-cloud'; export const DBT_CLOUD_URN = `urn:li:dataPlatform:dbt`; export const VERTICA = 'vertica'; export const VERTICA_URN = `urn:li:dataPlatform:${VERTICA}`; +export const FIVETRAN = 'fivetran'; +export const FIVETRAN_URN = `urn:li:dataPlatform:${FIVETRAN}`; export const PLATFORM_URN_TO_LOGO = { [ATHENA_URN]: athenaLogo, @@ -138,6 +141,7 @@ export const PLATFORM_URN_TO_LOGO = { [SUPERSET_URN]: supersetLogo, [UNITY_CATALOG_URN]: databricksLogo, [VERTICA_URN]: verticaLogo, + [FIVETRAN_URN]: fivetranLogo, }; export const SOURCE_TO_PLATFORM_URN = { diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index b18384909c33f0..9619abebbd54e6 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -216,6 +216,13 @@ "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/vertica/", "recipe": "source:\n type: vertica\n config:\n # Coordinates\n host_port: localhost:5433\n # The name of the vertica database\n database: Database_Name\n # Credentials\n username: Vertica_User\n password: Vertica_Password\n\n include_tables: true\n include_views: true\n include_projections: true\n include_models: true\n include_view_lineage: true\n include_projection_lineage: true\n profiling:\n enabled: false\n stateful_ingestion:\n enabled: true " }, + { + "urn": "urn:li:dataPlatform:fivetran", + "name": "fivetran", + "displayName": "Fivetran", + "docsUrl": "https://datahubproject.io/docs/generated/ingestion/sources/fivetran/", + "recipe": "source:\n type: fivetran\n config:\n # Fivetran log connector destination server configurations\n fivetran_log_config:\n destination_platform: snowflake\n destination_config:\n # Coordinates\n account_id: snowflake_account_id\n warehouse: warehouse_name\n database: snowflake_db\n log_schema: fivetran_log_schema\n\n # Credentials\n username: ${SNOWFLAKE_USER}\n password: ${SNOWFLAKE_PASS}\n role: snowflake_role\n\n # Optional - filter for certain connector names instead of ingesting everything.\n # connector_patterns:\n # allow:\n # - connector_name\n\n # Optional -- This mapping is optional and only required to configure platform-instance for source\n # A mapping of Fivetran connector id to data platform instance\n # sources_to_platform_instance:\n # calendar_elected:\n # platform_instance: cloud_postgres_instance\n # env: DEV\n\n # Optional -- This mapping is optional and only required to configure platform-instance for destination.\n # A mapping of Fivetran destination id to data platform instance\n # destination_to_platform_instance:\n # calendar_elected:\n # platform_instance: cloud_postgres_instance\n # env: DEV" + }, { "urn": "urn:li:dataPlatform:custom", "name": "custom", diff --git a/datahub-web-react/src/app/lineage/LineageExplorer.tsx b/datahub-web-react/src/app/lineage/LineageExplorer.tsx index ed0b26bde11efa..28cd7025f51f45 100644 --- a/datahub-web-react/src/app/lineage/LineageExplorer.tsx +++ b/datahub-web-react/src/app/lineage/LineageExplorer.tsx @@ -3,7 +3,6 @@ import { useHistory } from 'react-router'; import { Button, Drawer } from 'antd'; import { InfoCircleOutlined } from '@ant-design/icons'; import styled from 'styled-components'; -import { Message } from '../shared/Message'; import { useEntityRegistry } from '../useEntityRegistry'; import CompactContext from '../shared/CompactContext'; import { EntityAndType, EntitySelectParams, FetchedEntities } from './types'; @@ -18,12 +17,10 @@ import { ErrorSection } from '../shared/error/ErrorSection'; import usePrevious from '../shared/usePrevious'; import { useGetLineageTimeParams } from './utils/useGetLineageTimeParams'; import analytics, { EventType } from '../analytics'; +import LineageLoadingSection from './LineageLoadingSection'; const DEFAULT_DISTANCE_FROM_TOP = 106; -const LoadingMessage = styled(Message)` - margin-top: 10%; -`; const FooterButtonGroup = styled.div` display: flex; justify-content: space-between; @@ -167,7 +164,7 @@ export default function LineageExplorer({ urn, type }: Props) { return ( <> {error && } - {loading && } + {loading && } {!!data && (
) diff --git a/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx b/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx new file mode 100644 index 00000000000000..9d84de0c211729 --- /dev/null +++ b/datahub-web-react/src/app/lineage/LineageLoadingSection.tsx @@ -0,0 +1,27 @@ +import * as React from 'react'; +import styled from 'styled-components'; +import { LoadingOutlined } from '@ant-design/icons'; +import { ANTD_GRAY } from '../entity/shared/constants'; + +const Container = styled.div` + height: auto; + width: 100%; + display: flex; + justify-content: center; + align-items: center; + background-color: rgb(250, 250, 250); +`; + +const StyledLoading = styled(LoadingOutlined)` + font-size: 36px; + color: ${ANTD_GRAY[7]}; + padding-bottom: 18px; +]`; + +export default function LineageLoadingSection() { + return ( + + + + ); +} diff --git a/datahub-web-react/src/app/lineage/utils/layoutTree.ts b/datahub-web-react/src/app/lineage/utils/layoutTree.ts index cc704007049c20..a972a62308f073 100644 --- a/datahub-web-react/src/app/lineage/utils/layoutTree.ts +++ b/datahub-web-react/src/app/lineage/utils/layoutTree.ts @@ -32,6 +32,21 @@ function getParentRelationship(direction: Direction, parent: VizNode | null, nod return directionRelationships?.find((r) => r?.entity?.urn === node?.urn); } +// this utility function is to help make sure layouts that contain many references to the same URN don't struggle laying out that URN. +function firstAppearanceIndices(arr) { + const seen = new Set(); // To track which strings have been seen + const result = [] as number[]; + + for (let i = 0; i < arr.length; i++) { + if (!seen.has(arr[i])) { + seen.add(arr[i]); // Add the string to the set + result.push(i); // Save the index + } + } + + return result; +} + function layoutNodesForOneDirection( data: NodeData, direction: Direction, @@ -54,12 +69,10 @@ function layoutNodesForOneDirection( while (nodesInCurrentLayer.length > 0) { // if we've already added a node to the viz higher up dont add it again const urnsToAddInCurrentLayer = Array.from(new Set(nodesInCurrentLayer.map(({ node }) => node.urn || ''))); - const nodesToAddInCurrentLayer = urnsToAddInCurrentLayer - .filter((urn, pos) => urnsToAddInCurrentLayer.indexOf(urn) === pos) - .filter((urn) => !nodesByUrn[urn || '']); + const positionsToAddInCurrentLayer = firstAppearanceIndices(urnsToAddInCurrentLayer); const filteredNodesInCurrentLayer = nodesInCurrentLayer - .filter(({ node }) => nodesToAddInCurrentLayer.indexOf(node.urn || '') > -1) + .filter((_, idx) => positionsToAddInCurrentLayer.indexOf(idx) > -1) .filter(({ node }) => node.status?.removed !== true); const layerSize = filteredNodesInCurrentLayer.length; diff --git a/datahub-web-react/src/app/onboarding/config/LineageGraphOnboardingConfig.tsx b/datahub-web-react/src/app/onboarding/config/LineageGraphOnboardingConfig.tsx index 54bae6978a4a97..89a01ab3bd241c 100644 --- a/datahub-web-react/src/app/onboarding/config/LineageGraphOnboardingConfig.tsx +++ b/datahub-web-react/src/app/onboarding/config/LineageGraphOnboardingConfig.tsx @@ -23,7 +23,7 @@ export const LineageGraphOnboardingConfig: OnboardingStep[] = [ here. diff --git a/datahub-web-react/src/app/search/CommandK.tsx b/datahub-web-react/src/app/search/CommandK.tsx new file mode 100644 index 00000000000000..13e55a0e3f2661 --- /dev/null +++ b/datahub-web-react/src/app/search/CommandK.tsx @@ -0,0 +1,29 @@ +import React from 'react'; +import styled from 'styled-components'; +import { ANTD_GRAY } from '../entity/shared/constants'; + +const Container = styled.div` + color: ${ANTD_GRAY[6]}; + background-color: #ffffff; + opacity: 0.9; + border-color: black; + border-radius: 6px; + border: 1px solid ${ANTD_GRAY[6]}; + padding-right: 6px; + padding-left: 6px; + margin-right: 4px; + margin-left: 4px; +`; + +const Letter = styled.span` + padding: 2px; +`; + +export const CommandK = () => { + return ( + + + K + + ); +}; diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx index 5f797e68fe0e8b..a23ead83caf541 100644 --- a/datahub-web-react/src/app/search/SearchBar.tsx +++ b/datahub-web-react/src/app/search/SearchBar.tsx @@ -23,6 +23,7 @@ import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; import ViewAllSearchItem from './ViewAllSearchItem'; import { ViewSelect } from '../entity/view/select/ViewSelect'; import { combineSiblingsInAutoComplete } from './utils/combineSiblingsInAutoComplete'; +import { CommandK } from './CommandK'; const StyledAutoComplete = styled(AutoComplete)` width: 100%; @@ -114,6 +115,7 @@ interface Props { fixAutoComplete?: boolean; hideRecommendations?: boolean; showQuickFilters?: boolean; + showCommandK?: boolean; viewsEnabled?: boolean; combineSiblings?: boolean; setIsSearchBarFocused?: (isSearchBarFocused: boolean) => void; @@ -142,6 +144,7 @@ export const SearchBar = ({ fixAutoComplete, hideRecommendations, showQuickFilters, + showCommandK = false, viewsEnabled = false, combineSiblings = false, setIsSearchBarFocused, @@ -153,6 +156,8 @@ export const SearchBar = ({ const [searchQuery, setSearchQuery] = useState(initialQuery); const [selected, setSelected] = useState(); const [isDropdownVisible, setIsDropdownVisible] = useState(false); + const [isFocused, setIsFocused] = useState(false); + useEffect(() => setSelected(initialQuery), [initialQuery]); const searchEntityTypes = entityRegistry.getSearchEntityTypes(); @@ -277,11 +282,13 @@ export const SearchBar = ({ function handleFocus() { if (onFocus) onFocus(); handleSearchBarClick(true); + setIsFocused(true); } function handleBlur() { if (onBlur) onBlur(); handleSearchBarClick(false); + setIsFocused(false); } function handleSearch(query: string, type?: EntityType, appliedQuickFilters?: FacetFilterInput[]) { @@ -294,18 +301,21 @@ export const SearchBar = ({ const searchInputRef = useRef(null); useEffect(() => { - const handleKeyDown = (event) => { - // Support command-k to select the search bar. - // 75 is the keyCode for 'k' - if ((event.metaKey || event.ctrlKey) && event.keyCode === 75) { - (searchInputRef?.current as any)?.focus(); - } - }; - document.addEventListener('keydown', handleKeyDown); - return () => { - document.removeEventListener('keydown', handleKeyDown); - }; - }, []); + if (showCommandK) { + const handleKeyDown = (event) => { + // Support command-k to select the search bar. + // 75 is the keyCode for 'k' + if ((event.metaKey || event.ctrlKey) && event.keyCode === 75) { + (searchInputRef?.current as any)?.focus(); + } + }; + document.addEventListener('keydown', handleKeyDown); + return () => { + document.removeEventListener('keydown', handleKeyDown); + }; + } + return () => null; + }, [showCommandK]); return ( @@ -377,7 +387,7 @@ export const SearchBar = ({ data-testid="search-input" onFocus={handleFocus} onBlur={handleBlur} - allowClear={{ clearIcon: }} + allowClear={(isFocused && { clearIcon: }) || false} prefix={ <> {viewsEnabled && ( @@ -411,6 +421,7 @@ export const SearchBar = ({ } ref={searchInputRef} + suffix={(showCommandK && !isFocused && ) || null} /> diff --git a/datahub-web-react/src/app/search/SearchHeader.tsx b/datahub-web-react/src/app/search/SearchHeader.tsx index 91f9753a3d6012..76e78a11d3e9d9 100644 --- a/datahub-web-react/src/app/search/SearchHeader.tsx +++ b/datahub-web-react/src/app/search/SearchHeader.tsx @@ -108,6 +108,7 @@ export const SearchHeader = ({ fixAutoComplete showQuickFilters showViewAllResults + showCommandK /> diff --git a/datahub-web-react/src/app/search/SearchablePage.tsx b/datahub-web-react/src/app/search/SearchablePage.tsx index 489687050c749d..9d02d85d3634c0 100644 --- a/datahub-web-react/src/app/search/SearchablePage.tsx +++ b/datahub-web-react/src/app/search/SearchablePage.tsx @@ -1,5 +1,6 @@ import React, { useEffect, useState } from 'react'; import { useHistory, useLocation } from 'react-router'; +import { debounce } from 'lodash'; import * as QueryString from 'query-string'; import { useTheme } from 'styled-components'; import { SearchHeader } from './SearchHeader'; @@ -17,6 +18,7 @@ import { getAutoCompleteInputFromQuickFilter } from './utils/filterUtils'; import { useQuickFiltersContext } from '../../providers/QuickFiltersContext'; import { useUserContext } from '../context/useUserContext'; import { useSelectedSortOption } from './context/SearchContext'; +import { HALF_SECOND_IN_MS } from '../entity/shared/tabs/Dataset/Queries/utils/constants'; const styles = { children: { @@ -93,7 +95,7 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) => }); }; - const autoComplete = (query: string) => { + const autoComplete = debounce((query: string) => { if (query && query.trim() !== '') { getAutoCompleteResults({ variables: { @@ -105,7 +107,7 @@ export const SearchablePage = ({ onSearch, onAutoComplete, children }: Props) => }, }); } - }; + }, HALF_SECOND_IN_MS); // Load correct autocomplete results on initial page load. useEffect(() => { diff --git a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx index ce1ad93565ba43..4a7a4938ea9709 100644 --- a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx +++ b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx @@ -73,7 +73,6 @@ export function HeaderLinks(props: Props) { const showSettings = true; const showIngestion = isIngestionEnabled && me && me.platformPrivileges?.manageIngestion && me.platformPrivileges?.manageSecrets; - const showDomains = me?.platformPrivileges?.createDomains || me?.platformPrivileges?.manageDomains; useUpdateEducationStepIdsAllowlist(!!showIngestion, HOME_PAGE_INGESTION_ID); @@ -106,22 +105,20 @@ export function HeaderLinks(props: Props) { View and modify your data dictionary - {showDomains && ( - - - - - Domains - - Manage related groups of data assets - - - )} + + + + + Domains + + Manage related groups of data assets + + } > diff --git a/datahub-web-react/src/graphql/chart.graphql b/datahub-web-react/src/graphql/chart.graphql index d4d3c3c9184082..a4b430720fa3d5 100644 --- a/datahub-web-react/src/graphql/chart.graphql +++ b/datahub-web-react/src/graphql/chart.graphql @@ -100,6 +100,9 @@ query getChart($urn: String!) { canEditLineage canEditEmbed } + subTypes { + typeNames + } } } diff --git a/datahub-web-react/src/graphql/lineage.graphql b/datahub-web-react/src/graphql/lineage.graphql index 52385dee8631ac..8fdfb696e08943 100644 --- a/datahub-web-react/src/graphql/lineage.graphql +++ b/datahub-web-react/src/graphql/lineage.graphql @@ -165,6 +165,9 @@ fragment lineageNodeProperties on EntityWithRelationships { status { removed } + subTypes { + typeNames + } } ... on Dataset { name diff --git a/datahub-web-react/src/graphql/search.graphql b/datahub-web-react/src/graphql/search.graphql index 2297c2d0c1d075..876be12fd335b7 100644 --- a/datahub-web-react/src/graphql/search.graphql +++ b/datahub-web-react/src/graphql/search.graphql @@ -105,6 +105,9 @@ fragment autoCompleteFields on Entity { parentContainers { ...parentContainersFields } + subTypes { + typeNames + } } ... on DataFlow { orchestrator @@ -550,6 +553,9 @@ fragment searchResultFields on Entity { } } } + subTypes { + typeNames + } } ... on DataFlow { flowId diff --git a/datahub-web-react/src/images/fivetranlogo.png b/datahub-web-react/src/images/fivetranlogo.png new file mode 100644 index 00000000000000..d5c999ad2d86e9 Binary files /dev/null and b/datahub-web-react/src/images/fivetranlogo.png differ diff --git a/datahub-web-react/yarn.lock b/datahub-web-react/yarn.lock index 590f3ebcef8c33..ce0f2f514dad1e 100644 --- a/datahub-web-react/yarn.lock +++ b/datahub-web-react/yarn.lock @@ -2298,6 +2298,14 @@ "@graphql-tools/utils" "^6" tslib "~2.0.1" +"@graphql-codegen/fragment-matcher@^5.0.0": + version "5.0.0" + resolved "https://registry.yarnpkg.com/@graphql-codegen/fragment-matcher/-/fragment-matcher-5.0.0.tgz#2a016715e42e8f21aa08830f34a4d0a930e660fe" + integrity sha512-mbash9E8eY6RSMSNrrO+C9JJEn8rdr8ORaxMpgdWL2qe2q/TlLUCE3ZvQvHkSc7GjBnMEk36LncA8ApwHR2BHg== + dependencies: + "@graphql-codegen/plugin-helpers" "^5.0.0" + tslib "~2.5.0" + "@graphql-codegen/near-operation-file-preset@^1.17.13": version "1.18.6" resolved "https://registry.yarnpkg.com/@graphql-codegen/near-operation-file-preset/-/near-operation-file-preset-1.18.6.tgz#2378ac75feaeaa1cfd2146bd84bf839b1fe20d9d" @@ -2331,6 +2339,18 @@ lodash "~4.17.0" tslib "~2.3.0" +"@graphql-codegen/plugin-helpers@^5.0.0": + version "5.0.1" + resolved "https://registry.yarnpkg.com/@graphql-codegen/plugin-helpers/-/plugin-helpers-5.0.1.tgz#e2429fcfba3f078d5aa18aa062d46c922bbb0d55" + integrity sha512-6L5sb9D8wptZhnhLLBcheSPU7Tg//DGWgc5tQBWX46KYTOTQHGqDpv50FxAJJOyFVJrveN9otWk9UT9/yfY4ww== + dependencies: + "@graphql-tools/utils" "^10.0.0" + change-case-all "1.0.15" + common-tags "1.8.2" + import-from "4.0.0" + lodash "~4.17.0" + tslib "~2.5.0" + "@graphql-codegen/typescript-operations@1.17.13": version "1.17.13" resolved "https://registry.yarnpkg.com/@graphql-codegen/typescript-operations/-/typescript-operations-1.17.13.tgz#a5b08c1573b9507ca5a9e66e795aecc40ddc5305" @@ -2584,6 +2604,16 @@ dependencies: tslib "^2.4.0" +"@graphql-tools/utils@^10.0.0": + version "10.0.8" + resolved "https://registry.yarnpkg.com/@graphql-tools/utils/-/utils-10.0.8.tgz#c7b84275ec83dc42ad9f3d4ffc424ff682075759" + integrity sha512-yjyA8ycSa1WRlJqyX/aLqXeE5DvF/H02+zXMUFnCzIDrj0UvLMUrxhmVFnMK0Q2n3bh4uuTeY3621m5za9ovXw== + dependencies: + "@graphql-typed-document-node/core" "^3.1.1" + cross-inspect "1.0.0" + dset "^3.1.2" + tslib "^2.4.0" + "@graphql-tools/utils@^6": version "6.2.4" resolved "https://registry.yarnpkg.com/@graphql-tools/utils/-/utils-6.2.4.tgz#38a2314d2e5e229ad4f78cca44e1199e18d55856" @@ -2618,6 +2648,11 @@ resolved "https://registry.yarnpkg.com/@graphql-typed-document-node/core/-/core-3.1.0.tgz#0eee6373e11418bfe0b5638f654df7a4ca6a3950" integrity sha512-wYn6r8zVZyQJ6rQaALBEln5B1pzxb9shV5Ef97kTvn6yVGrqyXVnDqnU24MXnFubR+rZjBY9NWuxX3FB2sTsjg== +"@graphql-typed-document-node/core@^3.1.1": + version "3.2.0" + resolved "https://registry.yarnpkg.com/@graphql-typed-document-node/core/-/core-3.2.0.tgz#5f3d96ec6b2354ad6d8a28bf216a1d97b5426861" + integrity sha512-mB9oAsNCm9aM3/SOv4YtBMqZbYj10R7dkq8byBqxGY/ncFwhf2oQzMV+LCRlWoDSEBJ3COiR1yeDvMtsoOsuFQ== + "@hapi/address@2.x.x": version "2.1.4" resolved "https://registry.yarnpkg.com/@hapi/address/-/address-2.1.4.tgz#5d67ed43f3fd41a69d4b9ff7b56e7c0d1d0a81e5" @@ -7001,6 +7036,22 @@ change-case-all@1.0.14: upper-case "^2.0.2" upper-case-first "^2.0.2" +change-case-all@1.0.15: + version "1.0.15" + resolved "https://registry.yarnpkg.com/change-case-all/-/change-case-all-1.0.15.tgz#de29393167fc101d646cd76b0ef23e27d09756ad" + integrity sha512-3+GIFhk3sNuvFAJKU46o26OdzudQlPNBCu1ZQi3cMeMHhty1bhDxu2WrEilVNYaGvqUtR1VSigFcJOiS13dRhQ== + dependencies: + change-case "^4.1.2" + is-lower-case "^2.0.2" + is-upper-case "^2.0.2" + lower-case "^2.0.2" + lower-case-first "^2.0.2" + sponge-case "^1.0.1" + swap-case "^2.0.2" + title-case "^3.0.3" + upper-case "^2.0.2" + upper-case-first "^2.0.2" + change-case@^4.1.2: version "4.1.2" resolved "https://registry.yarnpkg.com/change-case/-/change-case-4.1.2.tgz#fedfc5f136045e2398c0410ee441f95704641e12" @@ -7357,6 +7408,11 @@ common-tags@1.8.0, common-tags@^1.8.0: resolved "https://registry.yarnpkg.com/common-tags/-/common-tags-1.8.0.tgz#8e3153e542d4a39e9b10554434afaaf98956a937" integrity sha512-6P6g0uetGpW/sdyUy/iQQCbFF0kWVMSIVSyYz7Zgjcgh8mgw8PQzDNZeyZ5DQ2gM7LBoZPHmnjz8rUthkBG5tw== +common-tags@1.8.2: + version "1.8.2" + resolved "https://registry.yarnpkg.com/common-tags/-/common-tags-1.8.2.tgz#94ebb3c076d26032745fd54face7f688ef5ac9c6" + integrity sha512-gk/Z852D2Wtb//0I+kRFNKKE9dIIVirjoqPoA1wJU+XePVXZfGeBpk45+A1rKO4Q43prqWBNY/MiIeRLbPWUaA== + commondir@^1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/commondir/-/commondir-1.0.1.tgz#ddd800da0c66127393cca5950ea968a3aaf1253b" @@ -7698,6 +7754,13 @@ cross-fetch@^3.1.5: dependencies: node-fetch "2.6.7" +cross-inspect@1.0.0: + version "1.0.0" + resolved "https://registry.yarnpkg.com/cross-inspect/-/cross-inspect-1.0.0.tgz#5fda1af759a148594d2d58394a9e21364f6849af" + integrity sha512-4PFfn4b5ZN6FMNGSZlyb7wUhuN8wvj8t/VQHZdM4JsDcruGJ8L2kf9zao98QIrBPFCpdk27qst/AGTl7pL3ypQ== + dependencies: + tslib "^2.4.0" + cross-spawn@7.0.3, cross-spawn@^7.0.0, cross-spawn@^7.0.2, cross-spawn@^7.0.3: version "7.0.3" resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.3.tgz#f73a85b9d5d41d045551c177e2882d4ac85728a6" @@ -8595,6 +8658,11 @@ dotenv@^8.2.0: resolved "https://registry.yarnpkg.com/dotenv/-/dotenv-8.6.0.tgz#061af664d19f7f4d8fc6e4ff9b584ce237adcb8b" integrity sha512-IrPdXQsk2BbzvCBGBOTmmSH5SodmqZNt4ERAZDmW4CT+tL8VtvinqywuANaFu4bOMWki16nqf0e4oC0QIaDr/g== +dset@^3.1.2: + version "3.1.3" + resolved "https://registry.yarnpkg.com/dset/-/dset-3.1.3.tgz#c194147f159841148e8e34ca41f638556d9542d2" + integrity sha512-20TuZZHCEZ2O71q9/+8BwKwZ0QtD9D8ObhrihJPr+vLLYlSuAU3/zL4cSlgbfeoGHTjCSJBa7NGcrF9/Bx/WJQ== + duplexer3@^0.1.4: version "0.1.4" resolved "https://registry.yarnpkg.com/duplexer3/-/duplexer3-0.1.4.tgz#ee01dd1cac0ed3cbc7fdbea37dc0a8f1ce002ce2" @@ -18712,6 +18780,11 @@ tslib@~2.3.0: resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.3.1.tgz#e8a335add5ceae51aa261d32a490158ef042ef01" integrity sha512-77EbyPPpMz+FRFRuAFlWMtmgUWGe9UOG2Z25NqCwiIjRhOf5iKGuzSe5P2w1laq+FkRy4p+PCuVkJSGkzTEKVw== +tslib@~2.5.0: + version "2.5.3" + resolved "https://registry.yarnpkg.com/tslib/-/tslib-2.5.3.tgz#24944ba2d990940e6e982c4bea147aba80209913" + integrity sha512-mSxlJJwl3BMEQCUNnxXBU9jP4JBktcEGhURcPR6VQVlnP0FdDEsIaz0C35dXNGLyRfrATNofF0F5p2KPxQgB+w== + tsutils@^3.17.1: version "3.21.0" resolved "https://registry.yarnpkg.com/tsutils/-/tsutils-3.21.0.tgz#b48717d394cea6c1e096983eed58e9d61715b623" diff --git a/docker/datahub-upgrade/env/docker-without-neo4j.env b/docker/datahub-upgrade/env/docker-without-neo4j.env index c399f71b7b15c8..04d888f076cd68 100644 --- a/docker/datahub-upgrade/env/docker-without-neo4j.env +++ b/docker/datahub-upgrade/env/docker-without-neo4j.env @@ -21,6 +21,7 @@ DATAHUB_GMS_PORT=8080 ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml BACKFILL_BROWSE_PATHS_V2=true +REPROCESS_DEFAULT_BROWSE_PATHS_V2=${REPROCESS_DEFAULT_BROWSE_PATHS_V2:-false} # Uncomment and set these to support SSL connection to Elasticsearch # ELASTICSEARCH_USE_SSL= diff --git a/docker/datahub-upgrade/env/docker.env b/docker/datahub-upgrade/env/docker.env index 491470406153b2..b2a0d01e5d4ae8 100644 --- a/docker/datahub-upgrade/env/docker.env +++ b/docker/datahub-upgrade/env/docker.env @@ -25,6 +25,7 @@ DATAHUB_GMS_PORT=8080 ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml BACKFILL_BROWSE_PATHS_V2=true +REPROCESS_DEFAULT_BROWSE_PATHS_V2=${REPROCESS_DEFAULT_BROWSE_PATHS_V2:-false} # Uncomment and set these to support SSL connection to Elasticsearch # ELASTICSEARCH_USE_SSL= diff --git a/docker/docker-compose.dev.yml b/docker/docker-compose.dev.yml index c4e5ee7fa0cae9..774c4e17bee21f 100644 --- a/docker/docker-compose.dev.yml +++ b/docker/docker-compose.dev.yml @@ -54,6 +54,8 @@ services: - ${HOME}/.datahub/plugins:/etc/datahub/plugins datahub-upgrade: image: acryldata/datahub-upgrade:debug + ports: + - ${DATAHUB_MAPPED_UPGRADE_DEBUG_PORT:-5003}:5003 build: context: datahub-upgrade dockerfile: Dockerfile @@ -63,6 +65,8 @@ services: - SKIP_ELASTICSEARCH_CHECK=false - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-dev} - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} + - REPROCESS_DEFAULT_BROWSE_PATHS_V2=${REPROCESS_DEFAULT_BROWSE_PATHS_V2:-false} + - JAVA_TOOL_OPTIONS=-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5003 volumes: - ../datahub-upgrade/build/libs/:/datahub/datahub-upgrade/bin/ - ../metadata-models/src/main/resources/:/datahub/datahub-gms/resources diff --git a/docker/docker-compose.override.yml b/docker/docker-compose.override.yml index 225aa01fa4e4f8..0907f47d70c3ce 100644 --- a/docker/docker-compose.override.yml +++ b/docker/docker-compose.override.yml @@ -7,8 +7,12 @@ services: environment: - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} volumes: - ${HOME}/.datahub/plugins:/etc/datahub/plugins + datahub-upgrade: + environment: + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} mysql-setup: container_name: mysql-setup hostname: mysql-setup diff --git a/docker/elasticsearch-setup/create-indices.sh b/docker/elasticsearch-setup/create-indices.sh index 343013402394f8..5c4eb3ce3851ef 100755 --- a/docker/elasticsearch-setup/create-indices.sh +++ b/docker/elasticsearch-setup/create-indices.sh @@ -129,7 +129,7 @@ function create_datahub_usage_event_aws_elasticsearch() { if [ $USAGE_EVENT_STATUS -eq 200 ]; then USAGE_EVENT_DEFINITION=$(curl "${CURL_ARGS[@]}" "$ELASTICSEARCH_URL/${PREFIX}datahub_usage_event") # the definition is expected to contain "datahub_usage_event-000001" string - if [[ $USAGE_EVENT_DEFINITION != *"datahub_usage_event-$INDEX_SUFFIX"* ]]; then + if [[ $USAGE_EVENT_DEFINITION != *"datahub_usage_event-"* ]]; then # ... if it doesn't, we need to drop it echo -e "\n>>> deleting invalid datahub_usage_event ..." curl "${CURL_ARGS[@]}" -XDELETE "$ELASTICSEARCH_URL/${PREFIX}datahub_usage_event" diff --git a/docker/mysql-setup/init.sql b/docker/mysql-setup/init.sql index b789329ddfd179..b6a1d47fb2a022 100644 --- a/docker/mysql-setup/init.sql +++ b/docker/mysql-setup/init.sql @@ -1,6 +1,6 @@ -- create datahub database -CREATE DATABASE IF NOT EXISTS DATAHUB_DB_NAME CHARACTER SET utf8mb4 COLLATE utf8mb4_bin; -USE DATAHUB_DB_NAME; +CREATE DATABASE IF NOT EXISTS `DATAHUB_DB_NAME` CHARACTER SET utf8mb4 COLLATE utf8mb4_bin; +USE `DATAHUB_DB_NAME`; -- create metadata aspect table create table if not exists metadata_aspect_v2 ( diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 8b7d01c77191a5..4df32395cf82d5 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -81,32 +81,32 @@ services: - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} - DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-gms - - EBEAN_DATASOURCE_USERNAME=datahub - - EBEAN_DATASOURCE_PASSWORD=datahub + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8&enabledTLSProtocols=TLSv1.2 - - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - - ELASTICSEARCH_PORT=9200 - - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - - NEO4J_HOST=http://neo4j:7474 - - NEO4J_URI=bolt://neo4j - - NEO4J_USERNAME=neo4j - - NEO4J_PASSWORD=datahub - - JAVA_OPTS=-Xms1g -Xmx1g - - GRAPH_SERVICE_DIFF_MODE_ENABLED=true - - GRAPH_SERVICE_IMPL=neo4j + - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true + - ELASTICSEARCH_PORT=9200 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true + - ES_BULK_REFRESH_POLICY=WAIT_UNTIL + - GRAPH_SERVICE_DIFF_MODE_ENABLED=true + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} + - JAVA_OPTS=-Xms1g -Xmx1g + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=true - MCE_CONSUMER_ENABLED=true + - METADATA_SERVICE_AUTH_ENABLED=false + - NEO4J_HOST=http://neo4j:7474 + - NEO4J_PASSWORD=datahub + - NEO4J_URI=bolt://neo4j + - NEO4J_USERNAME=neo4j - PE_CONSUMER_ENABLED=true - UI_INGESTION_ENABLED=true - - METADATA_SERVICE_AUTH_ENABLED=false healthcheck: interval: 1s retries: 3 @@ -134,23 +134,24 @@ services: neo4j: condition: service_healthy environment: - - EBEAN_DATASOURCE_USERNAME=datahub - - EBEAN_DATASOURCE_PASSWORD=datahub + - BACKFILL_BROWSE_PATHS_V2=true + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 - - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - EBEAN_DATASOURCE_USERNAME=datahub + - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false - ELASTICSEARCH_HOST=elasticsearch - - ELASTICSEARCH_PORT=9200 - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false - - GRAPH_SERVICE_IMPL=elasticsearch - - DATAHUB_GMS_HOST=datahub-gms - - DATAHUB_GMS_PORT=8080 + - ELASTICSEARCH_PORT=9200 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - - BACKFILL_BROWSE_PATHS_V2=true + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false hostname: datahub-upgrade image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} labels: diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml index e45bafc3da480e..b1cb6c208a42d6 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -144,6 +144,7 @@ services: - DATAHUB_GMS_PORT=8080 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - BACKFILL_BROWSE_PATHS_V2=true + - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false hostname: datahub-upgrade image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} labels: diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 020ef5e9a97b96..ab5182bf98ae50 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -144,6 +144,7 @@ services: - DATAHUB_GMS_PORT=8080 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - BACKFILL_BROWSE_PATHS_V2=true + - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false hostname: datahub-upgrade image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} labels: diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index f293b3e7a6b753..29c980532d46f6 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -81,32 +81,32 @@ services: - DATAHUB_SERVER_TYPE=${DATAHUB_SERVER_TYPE:-quickstart} - DATAHUB_TELEMETRY_ENABLED=${DATAHUB_TELEMETRY_ENABLED:-true} - DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-gms - - EBEAN_DATASOURCE_USERNAME=datahub - - EBEAN_DATASOURCE_PASSWORD=datahub + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8&enabledTLSProtocols=TLSv1.2 - - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - EBEAN_DATASOURCE_USERNAME=datahub - ELASTICSEARCH_HOST=elasticsearch - - ELASTICSEARCH_PORT=9200 - - ES_BULK_REFRESH_POLICY=WAIT_UNTIL - - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - - NEO4J_HOST=http://neo4j:7474 - - NEO4J_URI=bolt://neo4j - - NEO4J_USERNAME=neo4j - - NEO4J_PASSWORD=datahub - - JAVA_OPTS=-Xms1g -Xmx1g - - GRAPH_SERVICE_DIFF_MODE_ENABLED=true - - GRAPH_SERVICE_IMPL=neo4j + - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true + - ELASTICSEARCH_PORT=9200 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - ENTITY_SERVICE_ENABLE_RETENTION=true + - ES_BULK_REFRESH_POLICY=WAIT_UNTIL + - GRAPH_SERVICE_DIFF_MODE_ENABLED=true + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} + - JAVA_OPTS=-Xms1g -Xmx1g + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 - MAE_CONSUMER_ENABLED=true - MCE_CONSUMER_ENABLED=true + - METADATA_SERVICE_AUTH_ENABLED=false + - NEO4J_HOST=http://neo4j:7474 + - NEO4J_PASSWORD=datahub + - NEO4J_URI=bolt://neo4j + - NEO4J_USERNAME=neo4j - PE_CONSUMER_ENABLED=true - UI_INGESTION_ENABLED=true - - METADATA_SERVICE_AUTH_ENABLED=false healthcheck: interval: 1s retries: 3 @@ -134,23 +134,24 @@ services: neo4j: condition: service_healthy environment: - - EBEAN_DATASOURCE_USERNAME=datahub - - EBEAN_DATASOURCE_PASSWORD=datahub + - BACKFILL_BROWSE_PATHS_V2=true + - DATAHUB_GMS_HOST=datahub-gms + - DATAHUB_GMS_PORT=8080 + - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - EBEAN_DATASOURCE_HOST=mysql:3306 + - EBEAN_DATASOURCE_PASSWORD=datahub - EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8 - - EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver - - KAFKA_BOOTSTRAP_SERVER=broker:29092 - - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - EBEAN_DATASOURCE_USERNAME=datahub + - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false - ELASTICSEARCH_HOST=elasticsearch - - ELASTICSEARCH_PORT=9200 - ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true - ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true - - ELASTICSEARCH_BUILD_INDICES_CLONE_INDICES=false - - GRAPH_SERVICE_IMPL=elasticsearch - - DATAHUB_GMS_HOST=datahub-gms - - DATAHUB_GMS_PORT=8080 + - ELASTICSEARCH_PORT=9200 - ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml - - BACKFILL_BROWSE_PATHS_V2=true + - GRAPH_SERVICE_IMPL=${GRAPH_SERVICE_IMPL:-elasticsearch} + - KAFKA_BOOTSTRAP_SERVER=broker:29092 + - KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081 + - REPROCESS_DEFAULT_BROWSE_PATHS_V2=false hostname: datahub-upgrade image: ${DATAHUB_UPGRADE_IMAGE:-acryldata/datahub-upgrade}:${DATAHUB_VERSION:-head} labels: diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md index a8958985a0a724..4c97cadc88417e 100644 --- a/docs/modeling/metadata-model.md +++ b/docs/modeling/metadata-model.md @@ -625,7 +625,7 @@ curl --location --request POST 'http://localhost:8080/analytics?action=getTimese } } ``` -For more examples on the complex types of group-by/aggregations, refer to the tests in the group `getAggregatedStats` of [ElasticSearchTimeseriesAspectServiceTest.java](https://github.com/datahub-project/datahub/blob/master/metadata-io/src/test/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectServiceTest.java). +For more examples on the complex types of group-by/aggregations, refer to the tests in the group `getAggregatedStats` of [TimeseriesAspectServiceTestBase.java](https://github.com/datahub-project/datahub/blob/master/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java). diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authentication/AuthenticationRequest.java b/metadata-auth/auth-api/src/main/java/com/datahub/authentication/AuthenticationRequest.java index 91f15f9d5ae61d..5673bac5442b29 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authentication/AuthenticationRequest.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authentication/AuthenticationRequest.java @@ -1,6 +1,8 @@ package com.datahub.authentication; import com.datahub.plugins.auth.authentication.Authenticator; +import lombok.Getter; + import java.util.Map; import java.util.Objects; import java.util.TreeMap; @@ -13,14 +15,24 @@ * Currently, this class only hold the inbound request's headers, but could certainly be extended * to contain additional information like the request parameters, body, ip, etc as needed. */ +@Getter public class AuthenticationRequest { private final Map caseInsensitiveHeaders; + private final String servletInfo; + private final String pathInfo; + public AuthenticationRequest(@Nonnull final Map requestHeaders) { + this("", "", requestHeaders); + } + + public AuthenticationRequest(@Nonnull String servletInfo, @Nonnull String pathInfo, @Nonnull final Map requestHeaders) { Objects.requireNonNull(requestHeaders); caseInsensitiveHeaders = new TreeMap<>(String.CASE_INSENSITIVE_ORDER); caseInsensitiveHeaders.putAll(requestHeaders); + this.servletInfo = servletInfo; + this.pathInfo = pathInfo; } /** diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java index aec99e1b1e57a8..5a9990552bb34a 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizedActors.java @@ -15,6 +15,7 @@ public class AuthorizedActors { String privilege; List users; List groups; + List roles; boolean allUsers; boolean allGroups; } diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index a5af881022d8c9..e88fc870cb3331 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -101,6 +101,10 @@ def get_long_description(): f"acryl-datahub[testing-utils]{_self_pin}", # Extra requirements for loading our test dags. "apache-airflow[snowflake]>=2.0.2", + # Connexion's new version breaks Airflow: + # See https://github.com/apache/airflow/issues/35234. + # TODO: We should transition to using Airflow's constraints file. + "connexion<3", # https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350 # Eventually we want to set this to "snowflake-sqlalchemy>=1.4.3". # However, that doesn't work with older versions of Airflow. Instead diff --git a/metadata-ingestion/adding-source.md b/metadata-ingestion/adding-source.md index a0930102c6827c..6baddf6b2010dc 100644 --- a/metadata-ingestion/adding-source.md +++ b/metadata-ingestion/adding-source.md @@ -6,7 +6,7 @@ There are two ways of adding a metadata ingestion source. 2. You are writing the custom source for yourself and are not going to contribute back (yet). If you are going for case (1) just follow the steps 1 to 9 below. In case you are building it for yourself you can skip -steps 4-9 (but maybe write tests and docs for yourself as well) and follow the documentation +steps 4-8 (but maybe write tests and docs for yourself as well) and follow the documentation on [how to use custom ingestion sources](../docs/how/add-custom-ingestion-source.md) without forking Datahub. @@ -27,6 +27,7 @@ from `ConfigModel`. The [file source](./src/datahub/ingestion/source/file.py) is We use [pydantic](https://pydantic-docs.helpmanual.io) conventions for documenting configuration flags. Use the `description` attribute to write rich documentation for your configuration field. For example, the following code: + ```python from pydantic import Field from datahub.api.configuration.common import ConfigModel @@ -49,12 +50,10 @@ generates the following documentation:

- :::note Inline markdown or code snippets are not yet supported for field level documentation. ::: - ### 2. Set up the reporter The reporter interface enables the source to report statistics, warnings, failures, and other information about the run. @@ -71,6 +70,8 @@ some [convenience methods](./src/datahub/emitter/mce_builder.py) for commonly us ### 4. Set up the dependencies +Note: Steps 4-8 are only required if you intend to contribute the source back to the Datahub project. + Declare the source's pip dependencies in the `plugins` variable of the [setup script](./setup.py). ### 5. Enable discoverability @@ -119,37 +120,38 @@ from datahub.ingestion.api.decorators import ( @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") class FileSource(Source): """ - - The File Source can be used to produce all kinds of metadata from a generic metadata events file. + + The File Source can be used to produce all kinds of metadata from a generic metadata events file. :::note Events in this file can be in MCE form or MCP form. ::: - + """ ... source code goes here ``` - #### 7.2 Write custom documentation -- Create a copy of [`source-docs-template.md`](./source-docs-template.md) and edit all relevant components. +- Create a copy of [`source-docs-template.md`](./source-docs-template.md) and edit all relevant components. - Name the document as `` and move it to `metadata-ingestion/docs/sources//.md`. For example for the Kafka platform, under the `kafka` plugin, move the document to `metadata-ingestion/docs/sources/kafka/kafka.md`. - Add a quickstart recipe corresponding to the plugin under `metadata-ingestion/docs/sources//_recipe.yml`. For example, for the Kafka platform, under the `kafka` plugin, there is a quickstart recipe located at `metadata-ingestion/docs/sources/kafka/kafka_recipe.yml`. - To write platform-specific documentation (that is cross-plugin), write the documentation under `metadata-ingestion/docs/sources//README.md`. For example, cross-plugin documentation for the BigQuery platform is located under `metadata-ingestion/docs/sources/bigquery/README.md`. #### 7.3 Viewing the Documentation -Documentation for the source can be viewed by running the documentation generator from the `docs-website` module. +Documentation for the source can be viewed by running the documentation generator from the `docs-website` module. ##### Step 1: Build the Ingestion docs + ```console # From the root of DataHub repo ./gradlew :metadata-ingestion:docGen ``` If this finishes successfully, you will see output messages like: + ```console Ingestion Documentation Generation Complete ############################################ @@ -170,7 +172,8 @@ Ingestion Documentation Generation Complete You can also find documentation files generated at `./docs/generated/ingestion/sources` relative to the root of the DataHub repo. You should be able to locate your specific source's markdown file here and investigate it to make sure things look as expected. #### Step 2: Build the Entire Documentation -To view how this documentation looks in the browser, there is one more step. Just build the entire docusaurus page from the `docs-website` module. + +To view how this documentation looks in the browser, there is one more step. Just build the entire docusaurus page from the `docs-website` module. ```console # From the root of DataHub repo @@ -178,6 +181,7 @@ To view how this documentation looks in the browser, there is one more step. Jus ``` This will generate messages like: + ```console ... > Task :docs-website:yarnGenerate @@ -219,15 +223,15 @@ BUILD SUCCESSFUL in 35s 36 actionable tasks: 16 executed, 20 up-to-date ``` -After this you need to run the following script from the `docs-website` module. +After this you need to run the following script from the `docs-website` module. + ```console cd docs-website npm run serve ``` -Now, browse to http://localhost:3000 or whichever port npm is running on, to browse the docs. -Your source should show up on the left sidebar under `Metadata Ingestion / Sources`. - +Now, browse to http://localhost:3000 or whichever port npm is running on, to browse the docs. +Your source should show up on the left sidebar under `Metadata Ingestion / Sources`. ### 8. Add SQL Alchemy mapping (if applicable) diff --git a/metadata-ingestion/docs/sources/fivetran/fivetran_pre.md b/metadata-ingestion/docs/sources/fivetran/fivetran_pre.md new file mode 100644 index 00000000000000..949e408215e6f4 --- /dev/null +++ b/metadata-ingestion/docs/sources/fivetran/fivetran_pre.md @@ -0,0 +1,86 @@ +## Integration Details + +This source extracts the following: + +- Connectors in fivetran as Data Pipelines and Data Jobs to represent data lineage information between source and destination. +- Connector sources - DataJob input Datasets. +- Connector destination - DataJob output Datasets. +- Connector runs - DataProcessInstances as DataJob runs. + +## Configuration Notes + +1. Fivetran supports the fivetran platform connector to dump the log events and connectors, destinations, users and roles metadata in your destination. +2. You need to setup and start the initial sync of the fivetran platform connector before using this source. Refer [link](https://fivetran.com/docs/logs/fivetran-platform/setup-guide). +3. Once initial sync up of your fivetran platform connector is done, you need to provide the fivetran platform connector's destination platform and its configuration in the recipe. + +## Concept mapping + +| Fivetran | Datahub | +|--------------------------|--------------------------------------------------------------------------------------------------------| +| `Connector` | [DataJob](https://datahubproject.io/docs/generated/metamodel/entities/datajob/) | +| `Source` | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/) | +| `Destination` | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/) | +| `Connector Run` | [DataProcessInstance](https://datahubproject.io/docs/generated/metamodel/entities/dataprocessinstance) | + +Source and destination are mapped to Dataset as an Input and Output of Connector. + +## Current limitations + +Works only for Snowflake destination for now. + +## Snowflake destination Configuration Guide +1. If your fivetran platform connector destination is snowflake, you need to provide user details and its role with correct privileges in order to fetch metadata. +2. Snowflake system admin can follow this guide to create a fivetran_datahub role, assign it the required privileges, and assign it to a user by executing the following Snowflake commands from a user with the ACCOUNTADMIN role or MANAGE GRANTS privilege. + +```sql +create or replace role fivetran_datahub; + +// Grant access to a warehouse to run queries to view metadata +grant operate, usage on warehouse "" to role fivetran_datahub; + +// Grant access to view database and schema in which your log and metadata tables exist +grant usage on DATABASE "" to role fivetran_datahub; +grant usage on SCHEMA ""."" to role fivetran_datahub; + +// Grant access to execute select query on schema in which your log and metadata tables exist +grant select on all tables in SCHEMA ""."" to role fivetran_datahub; + +// Grant the fivetran_datahub to the snowflake user. +grant role fivetran_datahub to user snowflake_user; +``` + +## Advanced Configurations + +### Working with Platform Instances +If you've multiple instances of source/destination systems that are referred in your `fivetran` setup, you'd need to configure platform instance for these systems in `fivetran` recipe to generate correct lineage edges. Refer the document [Working with Platform Instances](https://datahubproject.io/docs/platform-instances) to understand more about this. + +While configuration of platform instance for source system you need to provide connector id as key and for destination system provide destination id as key. + +#### Example - Multiple Postgres Source Connectors each reading from different postgres instance +```yml + # Map of connector source to platform instance + sources_to_platform_instance: + postgres_connector_id1: + platform_instance: cloud_postgres_instance + env: PROD + + postgres_connector_id2: + platform_instance: local_postgres_instance + env: DEV +``` + +#### Example - Multiple Snowflake Destinations each writing to different snowflake instance +```yml + # Map of destination to platform instance + destination_to_platform_instance: + snowflake_destination_id1: + platform_instance: prod_snowflake_instance + env: PROD + + snowflake_destination_id2: + platform_instance: dev_snowflake_instance + env: PROD +``` + + + diff --git a/metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml b/metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml new file mode 100644 index 00000000000000..7c654df59723c1 --- /dev/null +++ b/metadata-ingestion/docs/sources/fivetran/fivetran_recipe.yml @@ -0,0 +1,43 @@ +source: + type: fivetran + config: + # Fivetran log connector destination server configurations + fivetran_log_config: + destination_platform: snowflake + destination_config: + # Coordinates + account_id: "abc48144" + warehouse: "COMPUTE_WH" + database: "MY_SNOWFLAKE_DB" + log_schema: "FIVETRAN_LOG" + + # Credentials + username: "${SNOWFLAKE_USER}" + password: "${SNOWFLAKE_PASS}" + role: "snowflake_role" + + # Optional - filter for certain connector names instead of ingesting everything. + # connector_patterns: + # allow: + # - connector_name + + # Optional -- A mapping of the connector's all sources to its database. + # sources_to_database: + # connector_id: source_db + + # Optional -- This mapping is optional and only required to configure platform-instance for source + # A mapping of Fivetran connector id to data platform instance + # sources_to_platform_instance: + # connector_id: + # platform_instance: cloud_instance + # env: DEV + + # Optional -- This mapping is optional and only required to configure platform-instance for destination. + # A mapping of Fivetran destination id to data platform instance + # destination_to_platform_instance: + # destination_id: + # platform_instance: cloud_instance + # env: DEV + +sink: + # sink configs diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index afce8dcee840b4..2392fce0580613 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -395,6 +395,7 @@ "powerbi-report-server": powerbi_report_server, "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.1"}, "unity-catalog": databricks | sqllineage_lib, + "fivetran": snowflake_common, } # This is mainly used to exclude plugins from the Docker image. @@ -525,6 +526,7 @@ "nifi", "vertica", "mode", + "fivetran", "kafka-connect", ] if plugin @@ -629,6 +631,7 @@ "unity-catalog = datahub.ingestion.source.unity.source:UnityCatalogSource", "gcs = datahub.ingestion.source.gcs.gcs_source:GCSSource", "sql-queries = datahub.ingestion.source.sql_queries:SqlQueriesSource", + "fivetran = datahub.ingestion.source.fivetran.fivetran:FivetranSource", ], "datahub.ingestion.transformer.plugins": [ "simple_remove_dataset_ownership = datahub.ingestion.transformer.remove_dataset_ownership:SimpleRemoveDatasetOwnership", diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py index 0face6415bacc4..6c42e830e223b1 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py @@ -100,7 +100,9 @@ def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]: ) return [tags] - def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: + def generate_mcp( + self, materialize_iolets: bool = True + ) -> Iterable[MetadataChangeProposalWrapper]: mcp = MetadataChangeProposalWrapper( entityUrn=str(self.urn), aspect=DataJobInfoClass( @@ -113,7 +115,9 @@ def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: ) yield mcp - yield from self.generate_data_input_output_mcp() + yield from self.generate_data_input_output_mcp( + materialize_iolets=materialize_iolets + ) for owner in self.generate_ownership_aspect(): mcp = MetadataChangeProposalWrapper( @@ -144,7 +148,9 @@ def emit( for mcp in self.generate_mcp(): emitter.emit(mcp, callback) - def generate_data_input_output_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: + def generate_data_input_output_mcp( + self, materialize_iolets: bool + ) -> Iterable[MetadataChangeProposalWrapper]: mcp = MetadataChangeProposalWrapper( entityUrn=str(self.urn), aspect=DataJobInputOutputClass( @@ -157,10 +163,9 @@ def generate_data_input_output_mcp(self) -> Iterable[MetadataChangeProposalWrapp yield mcp # Force entity materialization - for iolet in self.inlets + self.outlets: - mcp = MetadataChangeProposalWrapper( - entityUrn=str(iolet), - aspect=StatusClass(removed=False), - ) - - yield mcp + if materialize_iolets: + for iolet in self.inlets + self.outlets: + yield MetadataChangeProposalWrapper( + entityUrn=str(iolet), + aspect=StatusClass(removed=False), + ) diff --git a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py index cf6080c7072e69..2f07e4a112f934 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py +++ b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py @@ -220,12 +220,10 @@ def emit_process_end( self._emit_mcp(mcp, emitter, callback) def generate_mcp( - self, created_ts_millis: Optional[int] = None + self, created_ts_millis: Optional[int] = None, materialize_iolets: bool = True ) -> Iterable[MetadataChangeProposalWrapper]: - """ - Generates mcps from the object - :rtype: Iterable[MetadataChangeProposalWrapper] - """ + """Generates mcps from the object""" + mcp = MetadataChangeProposalWrapper( entityUrn=str(self.urn), aspect=DataProcessInstanceProperties( @@ -253,7 +251,7 @@ def generate_mcp( ) yield mcp - yield from self.generate_inlet_outlet_mcp() + yield from self.generate_inlet_outlet_mcp(materialize_iolets=materialize_iolets) @staticmethod def _emit_mcp( @@ -329,7 +327,9 @@ def from_dataflow(dataflow: DataFlow, id: str) -> "DataProcessInstance": dpi._template_object = dataflow return dpi - def generate_inlet_outlet_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: + def generate_inlet_outlet_mcp( + self, materialize_iolets: bool + ) -> Iterable[MetadataChangeProposalWrapper]: if self.inlets: mcp = MetadataChangeProposalWrapper( entityUrn=str(self.urn), @@ -349,10 +349,9 @@ def generate_inlet_outlet_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: yield mcp # Force entity materialization - for iolet in self.inlets + self.outlets: - mcp = MetadataChangeProposalWrapper( - entityUrn=str(iolet), - aspect=StatusClass(removed=False), - ) - - yield mcp + if materialize_iolets: + for iolet in self.inlets + self.outlets: + yield MetadataChangeProposalWrapper( + entityUrn=str(iolet), + aspect=StatusClass(removed=False), + ) diff --git a/metadata-ingestion/src/datahub/cli/ingest_cli.py b/metadata-ingestion/src/datahub/cli/ingest_cli.py index 9b5716408f3e43..dd0287004a3686 100644 --- a/metadata-ingestion/src/datahub/cli/ingest_cli.py +++ b/metadata-ingestion/src/datahub/cli/ingest_cli.py @@ -27,7 +27,6 @@ from datahub.ingestion.run.pipeline import Pipeline from datahub.telemetry import telemetry from datahub.upgrade import upgrade -from datahub.utilities import memory_leak_detector logger = logging.getLogger(__name__) @@ -98,7 +97,6 @@ def ingest() -> None: @click.option( "--no-spinner", type=bool, is_flag=True, default=False, help="Turn off spinner" ) -@click.pass_context @telemetry.with_telemetry( capture_kwargs=[ "dry_run", @@ -109,9 +107,7 @@ def ingest() -> None: "no_spinner", ] ) -@memory_leak_detector.with_leak_detection def run( - ctx: click.Context, config: str, dry_run: bool, preview: bool, diff --git a/metadata-ingestion/src/datahub/emitter/mcp.py b/metadata-ingestion/src/datahub/emitter/mcp.py index 9085ac152ea0b2..d6aa695665e4ef 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp.py +++ b/metadata-ingestion/src/datahub/emitter/mcp.py @@ -240,7 +240,7 @@ def from_obj_require_wrapper( return mcp def as_workunit( - self, *, treat_errors_as_warnings: bool = False + self, *, treat_errors_as_warnings: bool = False, is_primary_source: bool = True ) -> "MetadataWorkUnit": from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -254,10 +254,12 @@ def as_workunit( id=f"{self.entityUrn}-{self.aspectName}-{ts}", mcp=self, treat_errors_as_warnings=treat_errors_as_warnings, + is_primary_source=is_primary_source, ) return MetadataWorkUnit( id=f"{self.entityUrn}-{self.aspectName}", mcp=self, treat_errors_as_warnings=treat_errors_as_warnings, + is_primary_source=is_primary_source, ) diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index 5bfab3b841fa38..0cd37cc9398549 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -70,21 +70,10 @@ version=datahub_package.nice_version_name(), prog_name=datahub_package.__package_name__, ) -@click.option( - "-dl", - "--detect-memory-leaks", - type=bool, - is_flag=True, - default=False, - help="Run memory leak detection.", -) -@click.pass_context def datahub( - ctx: click.Context, debug: bool, log_file: Optional[str], debug_vars: bool, - detect_memory_leaks: bool, ) -> None: if debug_vars: # debug_vars implies debug. This option isn't actually used here, but instead @@ -109,10 +98,6 @@ def datahub( _logging_configured = configure_logging(debug=debug, log_file=log_file) _logging_configured.__enter__() - # Setup the context for the memory_leak_detector decorator. - ctx.ensure_object(dict) - ctx.obj["detect_memory_leaks"] = detect_memory_leaks - @datahub.command() @telemetry.with_telemetry() diff --git a/metadata-ingestion/src/datahub/ingestion/api/decorators.py b/metadata-ingestion/src/datahub/ingestion/api/decorators.py index 5e4427047104fe..b390ffb9dd0362 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/decorators.py +++ b/metadata-ingestion/src/datahub/ingestion/api/decorators.py @@ -93,10 +93,20 @@ def capability( """ def wrapper(cls: Type) -> Type: - if not hasattr(cls, "__capabilities"): + if not hasattr(cls, "__capabilities") or any( + # It's from this class and not a superclass. + cls.__capabilities is getattr(base, "__capabilities", None) + for base in cls.__bases__ + ): cls.__capabilities = {} cls.get_capabilities = lambda: cls.__capabilities.values() + # If the superclasses have capability annotations, copy those over. + for base in cls.__bases__: + base_caps = getattr(base, "__capabilities", None) + if base_caps: + cls.__capabilities.update(base_caps) + cls.__capabilities[capability_name] = CapabilitySetting( capability=capability_name, description=description, supported=supported ) diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 2ce9e07bc57bc8..fae260226195ce 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -17,6 +17,7 @@ from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.emitter.mce_builder import make_dataplatform_instance_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.mcp_builder import entity_supports_aspect from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.metadata.schema_classes import ( BrowsePathEntryClass, @@ -64,9 +65,9 @@ def auto_status_aspect( """ For all entities that don't have a status aspect, add one with removed set to false. """ - all_urns: Set[str] = set() status_urns: Set[str] = set() + skip_urns: Set[str] = set() for wu in stream: urn = wu.get_urn() all_urns.add(urn) @@ -89,9 +90,17 @@ def auto_status_aspect( else: raise ValueError(f"Unexpected type {type(wu.metadata)}") + if not isinstance( + wu.metadata, MetadataChangeEventClass + ) and not entity_supports_aspect(wu.metadata.entityType, StatusClass): + # If any entity does not support aspect 'status' then skip that entity from adding status aspect. + # Example like dataProcessInstance doesn't suppport status aspect. + # If not skipped gives error: java.lang.RuntimeException: Unknown aspect status for entity dataProcessInstance + skip_urns.add(urn) + yield wu - for urn in sorted(all_urns - status_urns): + for urn in sorted(all_urns - status_urns - skip_urns): yield MetadataChangeProposalWrapper( entityUrn=urn, aspect=StatusClass(removed=False), diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py index 96184d8d445e4e..e4f1bb275487ea 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_database_reader.py @@ -1,9 +1,11 @@ import json import logging from datetime import datetime -from typing import Dict, Iterable, Optional, Tuple +from typing import Any, Generic, Iterable, List, Optional, Tuple, TypeVar from sqlalchemy import create_engine +from sqlalchemy.engine import Row +from typing_extensions import Protocol from datahub.emitter.aspect import ASPECT_MAP from datahub.emitter.mcp import MetadataChangeProposalWrapper @@ -20,6 +22,62 @@ DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f" +class VersionOrderable(Protocol): + createdon: Any # Should restrict to only orderable types + version: int + + +ROW = TypeVar("ROW", bound=VersionOrderable) + + +class VersionOrderer(Generic[ROW]): + """Orders rows by (createdon, version == 0). + + That is, orders rows first by createdon, and for equal timestamps, puts version 0 rows last. + """ + + def __init__(self, enabled: bool): + # Stores all version 0 aspects for a given createdon timestamp + # Once we have emitted all aspects for a given timestamp, we can emit the version 0 aspects + # Guaranteeing that, for a given timestamp, we always ingest version 0 aspects last + self.queue: Optional[Tuple[datetime, List[ROW]]] = None + self.enabled = enabled + + def __call__(self, rows: Iterable[ROW]) -> Iterable[ROW]: + for row in rows: + yield from self._process_row(row) + yield from self._flush_queue() + + def _process_row(self, row: ROW) -> Iterable[ROW]: + if not self.enabled: + yield row + return + + yield from self._attempt_queue_flush(row) + if row.version == 0: + self._add_to_queue(row) + else: + yield row + + def _add_to_queue(self, row: ROW) -> None: + if self.queue is None: + self.queue = (row.createdon, [row]) + else: + self.queue[1].append(row) + + def _attempt_queue_flush(self, row: ROW) -> Iterable[ROW]: + if self.queue is None: + return + + if row.createdon > self.queue[0]: + yield from self._flush_queue() + + def _flush_queue(self) -> Iterable[ROW]: + if self.queue is not None: + yield from self.queue[1] + self.queue = None + + class DataHubDatabaseReader: def __init__( self, @@ -40,13 +98,14 @@ def query(self) -> str: # Offset is generally 0, unless we repeat the same createdon twice # Ensures stable order, chronological per (urn, aspect) - # Version 0 last, only when createdon is the same. Otherwise relies on createdon order + # Relies on createdon order to reflect version order + # Ordering of entries with the same createdon is handled by VersionOrderer return f""" - SELECT urn, aspect, metadata, systemmetadata, createdon + SELECT urn, aspect, metadata, systemmetadata, createdon, version FROM {self.engine.dialect.identifier_preparer.quote(self.config.database_table_name)} WHERE createdon >= %(since_createdon)s {"" if self.config.include_all_versions else "AND version = 0"} - ORDER BY createdon, urn, aspect, CASE WHEN version = 0 THEN 1 ELSE 0 END, version + ORDER BY createdon, urn, aspect, version LIMIT %(limit)s OFFSET %(offset)s """ @@ -54,6 +113,14 @@ def query(self) -> str: def get_aspects( self, from_createdon: datetime, stop_time: datetime ) -> Iterable[Tuple[MetadataChangeProposalWrapper, datetime]]: + orderer = VersionOrderer[Row](enabled=self.config.include_all_versions) + rows = self._get_rows(from_createdon=from_createdon, stop_time=stop_time) + for row in orderer(rows): + mcp = self._parse_row(row) + if mcp: + yield mcp, row.createdon + + def _get_rows(self, from_createdon: datetime, stop_time: datetime) -> Iterable[Row]: with self.engine.connect() as conn: ts = from_createdon offset = 0 @@ -69,34 +136,31 @@ def get_aspects( return for i, row in enumerate(rows): - row_dict = row._asdict() - mcp = self._parse_row(row_dict) - if mcp: - yield mcp, row_dict["createdon"] + yield row - if ts == row_dict["createdon"]: - offset += i + if ts == row.createdon: + offset += i + 1 else: - ts = row_dict["createdon"] + ts = row.createdon offset = 0 - def _parse_row(self, d: Dict) -> Optional[MetadataChangeProposalWrapper]: + def _parse_row(self, row: Row) -> Optional[MetadataChangeProposalWrapper]: try: - json_aspect = post_json_transform(json.loads(d["metadata"])) - json_metadata = post_json_transform(json.loads(d["systemmetadata"] or "{}")) + json_aspect = post_json_transform(json.loads(row.metadata)) + json_metadata = post_json_transform(json.loads(row.systemmetadata or "{}")) system_metadata = SystemMetadataClass.from_obj(json_metadata) return MetadataChangeProposalWrapper( - entityUrn=d["urn"], - aspect=ASPECT_MAP[d["aspect"]].from_obj(json_aspect), + entityUrn=row.urn, + aspect=ASPECT_MAP[row.aspect].from_obj(json_aspect), systemMetadata=system_metadata, changeType=ChangeTypeClass.UPSERT, ) except Exception as e: logger.warning( - f"Failed to parse metadata for {d['urn']}: {e}", exc_info=True + f"Failed to parse metadata for {row.urn}: {e}", exc_info=True ) self.report.num_database_parse_errors += 1 self.report.database_parse_errors.setdefault( str(e), LossyDict() - ).setdefault(d["aspect"], LossyList()).append(d["urn"]) + ).setdefault(row.aspect, LossyList()).append(row.urn) return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/__init__.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py new file mode 100644 index 00000000000000..b0843182c5cac4 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/config.py @@ -0,0 +1,145 @@ +import logging +from dataclasses import dataclass, field as dataclass_field +from typing import Dict, List, Optional + +import pydantic +from pydantic import Field, root_validator + +from datahub.configuration.common import AllowDenyPattern, ConfigModel +from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalSourceReport, + StatefulStaleMetadataRemovalConfig, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionConfigBase, +) +from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig + +logger = logging.getLogger(__name__) + + +class Constant: + """ + keys used in fivetran plugin + """ + + ORCHESTRATOR = "fivetran" + # table column name + SOURCE_SCHEMA_NAME = "source_schema_name" + SOURCE_TABLE_NAME = "source_table_name" + SOURCE_TABLE_ID = "source_table_id" + SOURCE_COLUMN_NAME = "source_column_name" + DESTINATION_SCHEMA_NAME = "destination_schema_name" + DESTINATION_TABLE_NAME = "destination_table_name" + DESTINATION_TABLE_ID = "destination_table_id" + DESTINATION_COLUMN_NAME = "destination_column_name" + SYNC_ID = "sync_id" + MESSAGE_DATA = "message_data" + TIME_STAMP = "time_stamp" + STATUS = "status" + USER_ID = "user_id" + GIVEN_NAME = "given_name" + FAMILY_NAME = "family_name" + CONNECTOR_ID = "connector_id" + CONNECTOR_NAME = "connector_name" + CONNECTOR_TYPE_ID = "connector_type_id" + PAUSED = "paused" + SYNC_FREQUENCY = "sync_frequency" + DESTINATION_ID = "destination_id" + CONNECTING_USER_ID = "connecting_user_id" + # Job status constants + SUCCESSFUL = "SUCCESSFUL" + FAILURE_WITH_TASK = "FAILURE_WITH_TASK" + CANCELED = "CANCELED" + + +KNOWN_DATA_PLATFORM_MAPPING = { + "postgres": "postgres", + "snowflake": "snowflake", +} + + +class DestinationConfig(BaseSnowflakeConfig): + database: str = Field(description="The fivetran connector log database.") + log_schema: str = Field(description="The fivetran connector log schema.") + + +class FivetranLogConfig(ConfigModel): + destination_platform: str = pydantic.Field( + default="snowflake", + description="The destination platform where fivetran connector log tables are dumped.", + ) + destination_config: Optional[DestinationConfig] = pydantic.Field( + default=None, + description="If destination platform is 'snowflake', provide snowflake configuration.", + ) + + @root_validator(pre=True) + def validate_destination_platfrom_and_config(cls, values: Dict) -> Dict: + destination_platform = values["destination_platform"] + if destination_platform == "snowflake": + if "destination_config" not in values: + raise ValueError( + "If destination platform is 'snowflake', user must provide snowflake destination configuration in the recipe." + ) + else: + raise ValueError( + f"Destination platform '{destination_platform}' is not yet supported." + ) + return values + + +@dataclass +class FivetranSourceReport(StaleEntityRemovalSourceReport): + connectors_scanned: int = 0 + filtered_connectors: List[str] = dataclass_field(default_factory=list) + + def report_connectors_scanned(self, count: int = 1) -> None: + self.connectors_scanned += count + + def report_connectors_dropped(self, model: str) -> None: + self.filtered_connectors.append(model) + + +class PlatformDetail(ConfigModel): + platform_instance: Optional[str] = pydantic.Field( + default=None, + description="The instance of the platform that all assets produced by this recipe belong to", + ) + env: str = pydantic.Field( + default=DEFAULT_ENV, + description="The environment that all assets produced by DataHub platform ingestion source belong to", + ) + + +class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): + fivetran_log_config: FivetranLogConfig = pydantic.Field( + description="Fivetran log connector destination server configurations.", + ) + connector_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description="Regex patterns for connectors to filter in ingestion.", + ) + include_column_lineage: bool = Field( + default=True, + description="Populates table->table column lineage.", + ) + sources_to_database: Dict[str, str] = pydantic.Field( + default={}, + description="A mapping of the connector's all sources to its database. Use connector id as key.", + ) + # Configuration for stateful ingestion + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field( + default=None, description="Airbyte Stateful Ingestion Config." + ) + # Fivetran connector all sources to platform instance mapping + sources_to_platform_instance: Dict[str, PlatformDetail] = pydantic.Field( + default={}, + description="A mapping of the connector's all sources dataset to platform instance. Use connector id as key.", + ) + # Fivetran destination to platform instance mapping + destination_to_platform_instance: Dict[str, PlatformDetail] = pydantic.Field( + default={}, + description="A mapping of destination dataset to platform instance. Use destination id as key.", + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py new file mode 100644 index 00000000000000..82bb5f3467c2a6 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/data_classes.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass +from typing import List + + +@dataclass +class ColumnLineage: + source_column: str + destination_column: str + + +@dataclass +class TableLineage: + source_table: str + destination_table: str + column_lineage: List[ColumnLineage] + + +@dataclass +class Connector: + connector_id: str + connector_name: str + connector_type: str + paused: bool + sync_frequency: int + destination_id: str + user_name: str + table_lineage: List[TableLineage] + jobs: List["Job"] + + +@dataclass +class Job: + job_id: str + start_time: int + end_time: int + status: str diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py new file mode 100644 index 00000000000000..c0395b4e4e7963 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran.py @@ -0,0 +1,289 @@ +import logging +from typing import Dict, Iterable, List, Optional + +import datahub.emitter.mce_builder as builder +from datahub.api.entities.datajob import DataFlow, DataJob +from datahub.api.entities.dataprocess.dataprocess_instance import ( + DataProcessInstance, + InstanceRunResult, +) +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.fivetran.config import ( + KNOWN_DATA_PLATFORM_MAPPING, + Constant, + FivetranSourceConfig, + FivetranSourceReport, + PlatformDetail, +) +from datahub.ingestion.source.fivetran.data_classes import Connector, Job +from datahub.ingestion.source.fivetran.fivetran_log_api import FivetranLogAPI +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, +) +from datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + FineGrainedLineage, + FineGrainedLineageDownstreamType, + FineGrainedLineageUpstreamType, +) +from datahub.metadata.schema_classes import StatusClass +from datahub.utilities.urns.data_flow_urn import DataFlowUrn +from datahub.utilities.urns.dataset_urn import DatasetUrn + +# Logger instance +logger = logging.getLogger(__name__) + + +@platform_name("Fivetran") +@config_class(FivetranSourceConfig) +@support_status(SupportStatus.INCUBATING) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability( + SourceCapability.LINEAGE_FINE, + "Enabled by default, can be disabled via configuration `include_column_lineage`", +) +class FivetranSource(StatefulIngestionSourceBase): + """ + This plugin extracts fivetran users, connectors, destinations and sync history. + This plugin is in beta and has only been tested on Snowflake connector. + """ + + config: FivetranSourceConfig + report: FivetranSourceReport + platform: str = "fivetran" + + def __init__(self, config: FivetranSourceConfig, ctx: PipelineContext): + super(FivetranSource, self).__init__(config, ctx) + self.config = config + self.report = FivetranSourceReport() + + self.audit_log = FivetranLogAPI(self.config.fivetran_log_config) + + # Create and register the stateful ingestion use-case handler. + self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ) + + def _extend_lineage(self, connector: Connector, datajob: DataJob) -> None: + input_dataset_urn_list: List[DatasetUrn] = [] + output_dataset_urn_list: List[DatasetUrn] = [] + fine_grained_lineage: List[FineGrainedLineage] = [] + + source_platform_detail: PlatformDetail = PlatformDetail() + destination_platform_detail: PlatformDetail = PlatformDetail() + # Get platform details for connector source + source_platform_detail = self.config.sources_to_platform_instance.get( + connector.connector_id, PlatformDetail() + ) + + # Get platform details for destination + destination_platform_detail = self.config.destination_to_platform_instance.get( + connector.destination_id, PlatformDetail() + ) + + # Get database for connector source + # TODO: Once Fivetran exposes this, we shouldn't ask for it via config. + source_database: Optional[str] = self.config.sources_to_database.get( + connector.connector_id + ) + + if connector.connector_type in KNOWN_DATA_PLATFORM_MAPPING: + source_platform = KNOWN_DATA_PLATFORM_MAPPING[connector.connector_type] + else: + source_platform = connector.connector_type + logger.info( + f"Fivetran connector source type: {connector.connector_type} is not supported to mapped with Datahub dataset entity." + ) + + for table_lineage in connector.table_lineage: + input_dataset_urn = DatasetUrn.create_from_ids( + platform_id=source_platform, + table_name=f"{source_database.lower()}.{table_lineage.source_table}" + if source_database + else table_lineage.source_table, + env=source_platform_detail.env, + platform_instance=source_platform_detail.platform_instance, + ) + input_dataset_urn_list.append(input_dataset_urn) + + output_dataset_urn: Optional[DatasetUrn] = None + if self.audit_log.fivetran_log_database: + output_dataset_urn = DatasetUrn.create_from_ids( + platform_id=self.config.fivetran_log_config.destination_platform, + table_name=f"{self.audit_log.fivetran_log_database.lower()}.{table_lineage.destination_table}", + env=destination_platform_detail.env, + platform_instance=destination_platform_detail.platform_instance, + ) + output_dataset_urn_list.append(output_dataset_urn) + + if self.config.include_column_lineage: + for column_lineage in table_lineage.column_lineage: + fine_grained_lineage.append( + FineGrainedLineage( + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=[ + builder.make_schema_field_urn( + str(input_dataset_urn), + column_lineage.source_column, + ) + ] + if input_dataset_urn + else [], + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=[ + builder.make_schema_field_urn( + str(output_dataset_urn), + column_lineage.destination_column, + ) + ] + if output_dataset_urn + else [], + ) + ) + + datajob.inlets.extend(input_dataset_urn_list) + datajob.outlets.extend(output_dataset_urn_list) + datajob.fine_grained_lineages.extend(fine_grained_lineage) + return None + + def _generate_dataflow_from_connector(self, connector: Connector) -> DataFlow: + return DataFlow( + orchestrator=Constant.ORCHESTRATOR, + id=connector.connector_id, + env=self.config.env, + name=connector.connector_name, + platform_instance=self.config.platform_instance, + ) + + def _generate_datajob_from_connector(self, connector: Connector) -> DataJob: + dataflow_urn = DataFlowUrn.create_from_ids( + orchestrator=Constant.ORCHESTRATOR, + flow_id=connector.connector_id, + env=self.config.env, + platform_instance=self.config.platform_instance, + ) + datajob = DataJob( + id=connector.connector_id, + flow_urn=dataflow_urn, + name=connector.connector_name, + owners={connector.user_name}, + ) + + job_property_bag: Dict[str, str] = {} + allowed_connection_keys = [ + Constant.PAUSED, + Constant.SYNC_FREQUENCY, + Constant.DESTINATION_ID, + ] + for key in allowed_connection_keys: + if hasattr(connector, key) and getattr(connector, key) is not None: + job_property_bag[key] = repr(getattr(connector, key)) + datajob.properties = job_property_bag + + # Map connector source and destination table with dataset entity + # Also extend the fine grained lineage of column if include_column_lineage is True + self._extend_lineage(connector=connector, datajob=datajob) + + # TODO: Add fine grained lineages of dataset after FineGrainedLineageDownstreamType.DATASET enabled + + return datajob + + def _generate_dpi_from_job(self, job: Job, datajob: DataJob) -> DataProcessInstance: + return DataProcessInstance.from_datajob( + datajob=datajob, + id=job.job_id, + clone_inlets=True, + clone_outlets=True, + ) + + def _get_dpi_workunits( + self, job: Job, dpi: DataProcessInstance + ) -> Iterable[MetadataWorkUnit]: + status_result_map: Dict[str, InstanceRunResult] = { + Constant.SUCCESSFUL: InstanceRunResult.SUCCESS, + Constant.FAILURE_WITH_TASK: InstanceRunResult.FAILURE, + Constant.CANCELED: InstanceRunResult.SKIPPED, + } + if job.status not in status_result_map: + logger.debug( + f"Status should be either SUCCESSFUL, FAILURE_WITH_TASK or CANCELED and it was " + f"{job.status}" + ) + return [] + result = status_result_map[job.status] + start_timestamp_millis = job.start_time * 1000 + for mcp in dpi.generate_mcp( + created_ts_millis=start_timestamp_millis, materialize_iolets=False + ): + yield mcp.as_workunit() + for mcp in dpi.start_event_mcp(start_timestamp_millis): + yield mcp.as_workunit() + for mcp in dpi.end_event_mcp( + end_timestamp_millis=job.end_time * 1000, + result=result, + result_type=Constant.ORCHESTRATOR, + ): + yield mcp.as_workunit() + + def _get_connector_workunits( + self, connector: Connector + ) -> Iterable[MetadataWorkUnit]: + self.report.report_connectors_scanned() + # Create dataflow entity with same name as connector name + dataflow = self._generate_dataflow_from_connector(connector) + for mcp in dataflow.generate_mcp(): + yield mcp.as_workunit() + + # Map Fivetran's connector entity with Datahub's datajob entity + datajob = self._generate_datajob_from_connector(connector) + for mcp in datajob.generate_mcp(materialize_iolets=True): + if mcp.entityType == "dataset" and isinstance(mcp.aspect, StatusClass): + # While we "materialize" the referenced datasets, we don't want them + # to be tracked by stateful ingestion. + yield mcp.as_workunit(is_primary_source=False) + else: + yield mcp.as_workunit() + + # Map Fivetran's job/sync history entity with Datahub's data process entity + for job in connector.jobs: + dpi = self._generate_dpi_from_job(job, datajob) + yield from self._get_dpi_workunits(job, dpi) + + @classmethod + def create(cls, config_dict: dict, ctx: PipelineContext) -> Source: + config = FivetranSourceConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + self.stale_entity_removal_handler.workunit_processor, + ] + + def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + """ + Datahub Ingestion framework invoke this method + """ + logger.info("Fivetran plugin execution is started") + connectors = self.audit_log.get_connectors_list() + for connector in connectors: + if not self.config.connector_patterns.allowed(connector.connector_name): + self.report.report_connectors_dropped(connector.connector_name) + continue + logger.info(f"Processing connector id: {connector.connector_id}") + yield from self._get_connector_workunits(connector) + + def get_report(self) -> SourceReport: + return self.report diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py new file mode 100644 index 00000000000000..d5d146559d9183 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_log_api.py @@ -0,0 +1,147 @@ +import json +import logging +from typing import Any, Dict, List, Optional + +from sqlalchemy import create_engine + +from datahub.ingestion.source.fivetran.config import Constant, FivetranLogConfig +from datahub.ingestion.source.fivetran.data_classes import ( + ColumnLineage, + Connector, + Job, + TableLineage, +) +from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery + +logger: logging.Logger = logging.getLogger(__name__) + + +class FivetranLogAPI: + def __init__(self, fivetran_log_config: FivetranLogConfig) -> None: + self.fivetran_log_database: Optional[str] = None + self.fivetran_log_config = fivetran_log_config + self.engine = self._get_log_destination_engine() + + def _get_log_destination_engine(self) -> Any: + destination_platform = self.fivetran_log_config.destination_platform + engine = None + # For every destination, create sqlalchemy engine, + # select the database and schema and set fivetran_log_database class variable + if destination_platform == "snowflake": + snowflake_destination_config = self.fivetran_log_config.destination_config + if snowflake_destination_config is not None: + engine = create_engine( + snowflake_destination_config.get_sql_alchemy_url(), + **snowflake_destination_config.get_options(), + ) + engine.execute( + FivetranLogQuery.use_schema( + snowflake_destination_config.database, + snowflake_destination_config.log_schema, + ) + ) + self.fivetran_log_database = snowflake_destination_config.database + return engine + + def _query(self, query: str) -> List[Dict]: + logger.debug("Query : {}".format(query)) + resp = self.engine.execute(query) + return [row for row in resp] + + def _get_table_lineage(self, connector_id: str) -> List[TableLineage]: + table_lineage_result = self._query( + FivetranLogQuery.get_table_lineage_query(connector_id=connector_id) + ) + table_lineage_list: List[TableLineage] = [] + for table_lineage in table_lineage_result: + column_lineage_result = self._query( + FivetranLogQuery.get_column_lineage_query( + source_table_id=table_lineage[Constant.SOURCE_TABLE_ID], + destination_table_id=table_lineage[Constant.DESTINATION_TABLE_ID], + ) + ) + column_lineage_list: List[ColumnLineage] = [ + ColumnLineage( + source_column=column_lineage[Constant.SOURCE_COLUMN_NAME], + destination_column=column_lineage[Constant.DESTINATION_COLUMN_NAME], + ) + for column_lineage in column_lineage_result + ] + table_lineage_list.append( + TableLineage( + source_table=f"{table_lineage[Constant.SOURCE_SCHEMA_NAME]}.{table_lineage[Constant.SOURCE_TABLE_NAME]}", + destination_table=f"{table_lineage[Constant.DESTINATION_SCHEMA_NAME]}.{table_lineage[Constant.DESTINATION_TABLE_NAME]}", + column_lineage=column_lineage_list, + ) + ) + + return table_lineage_list + + def _get_jobs_list(self, connector_id: str) -> List[Job]: + jobs: List[Job] = [] + sync_start_logs = { + row[Constant.SYNC_ID]: row + for row in self._query( + FivetranLogQuery.get_sync_start_logs_query(connector_id=connector_id) + ) + } + sync_end_logs = { + row[Constant.SYNC_ID]: row + for row in self._query( + FivetranLogQuery.get_sync_end_logs_query(connector_id=connector_id) + ) + } + for sync_id in sync_start_logs.keys(): + if sync_end_logs.get(sync_id) is None: + # If no sync-end event log for this sync id that means sync is still in progress + continue + + message_data = json.loads(sync_end_logs[sync_id][Constant.MESSAGE_DATA]) + if isinstance(message_data, str): + # Sometimes message_data contains json string inside string + # Ex: '"{\"status\":\"SUCCESSFUL\"}"' + # Hence, need to do json loads twice. + message_data = json.loads(message_data) + + jobs.append( + Job( + job_id=sync_id, + start_time=round( + sync_start_logs[sync_id][Constant.TIME_STAMP].timestamp() + ), + end_time=round( + sync_end_logs[sync_id][Constant.TIME_STAMP].timestamp() + ), + status=message_data[Constant.STATUS], + ) + ) + return jobs + + def _get_user_name(self, user_id: str) -> str: + user_details = self._query(FivetranLogQuery.get_user_query(user_id=user_id))[0] + return ( + f"{user_details[Constant.GIVEN_NAME]} {user_details[Constant.FAMILY_NAME]}" + ) + + def get_connectors_list(self) -> List[Connector]: + connectors: List[Connector] = [] + connector_list = self._query(FivetranLogQuery.get_connectors_query()) + for connector in connector_list: + connectors.append( + Connector( + connector_id=connector[Constant.CONNECTOR_ID], + connector_name=connector[Constant.CONNECTOR_NAME], + connector_type=connector[Constant.CONNECTOR_TYPE_ID], + paused=connector[Constant.PAUSED], + sync_frequency=connector[Constant.SYNC_FREQUENCY], + destination_id=connector[Constant.DESTINATION_ID], + user_name=self._get_user_name( + connector[Constant.CONNECTING_USER_ID] + ), + table_lineage=self._get_table_lineage( + connector[Constant.CONNECTOR_ID] + ), + jobs=self._get_jobs_list(connector[Constant.CONNECTOR_ID]), + ) + ) + return connectors diff --git a/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py new file mode 100644 index 00000000000000..4f52fcd5d884fb --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/fivetran/fivetran_query.py @@ -0,0 +1,76 @@ +class FivetranLogQuery: + @staticmethod + def use_schema(db_name: str, schema_name: str) -> str: + return f'use schema "{db_name}"."{schema_name}"' + + @staticmethod + def get_connectors_query() -> str: + return """ + SELECT connector_id as "CONNECTOR_ID", + connecting_user_id as "CONNECTING_USER_ID", + connector_type_id as "CONNECTOR_TYPE_ID", + connector_name as "CONNECTOR_NAME", + paused as "PAUSED", + sync_frequency as "SYNC_FREQUENCY", + destination_id as "DESTINATION_ID" + FROM CONNECTOR + WHERE _fivetran_deleted = FALSE""" + + @staticmethod + def get_user_query(user_id: str) -> str: + return f""" + SELECT id as "USER_ID", + given_name as "GIVEN_NAME", + family_name as "FAMILY_NAME" + FROM USER + WHERE id = '{user_id}'""" + + @staticmethod + def get_sync_start_logs_query( + connector_id: str, + ) -> str: + return f""" + SELECT time_stamp as "TIME_STAMP", + sync_id as "SYNC_ID" + FROM LOG + WHERE message_event = 'sync_start' + and connector_id = '{connector_id}' order by time_stamp""" + + @staticmethod + def get_sync_end_logs_query(connector_id: str) -> str: + return f""" + SELECT time_stamp as "TIME_STAMP", + sync_id as "SYNC_ID", + message_data as "MESSAGE_DATA" + FROM LOG + WHERE message_event = 'sync_end' + and connector_id = '{connector_id}' order by time_stamp""" + + @staticmethod + def get_table_lineage_query(connector_id: str) -> str: + return f""" + SELECT stm.id as "SOURCE_TABLE_ID", + stm.name as "SOURCE_TABLE_NAME", + ssm.name as "SOURCE_SCHEMA_NAME", + dtm.id as "DESTINATION_TABLE_ID", + dtm.name as "DESTINATION_TABLE_NAME", + dsm.name as "DESTINATION_SCHEMA_NAME" + FROM table_lineage as tl + JOIN source_table_metadata as stm on tl.source_table_id = stm.id + JOIN destination_table_metadata as dtm on tl.destination_table_id = dtm.id + JOIN source_schema_metadata as ssm on stm.schema_id = ssm.id + JOIN destination_schema_metadata as dsm on dtm.schema_id = dsm.id + WHERE stm.connector_id = '{connector_id}'""" + + @staticmethod + def get_column_lineage_query( + source_table_id: str, destination_table_id: str + ) -> str: + return f""" + SELECT scm.name as "SOURCE_COLUMN_NAME", + dcm.name as "DESTINATION_COLUMN_NAME" + FROM column_lineage as cl + JOIN source_column_metadata as scm on + (cl.source_column_id = scm.id and scm.table_id = {source_table_id}) + JOIN destination_column_metadata as dcm on + (cl.destination_column_id = dcm.id and dcm.table_id = {destination_table_id})""" diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 30c38720dd96c4..7ca5ce49019abd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -388,7 +388,7 @@ def _get_field_type( # if still not found, log and continue if type_class is None: - logger.info( + logger.debug( f"The type '{native_type}' is not recognized for field type, setting as NullTypeClass.", ) type_class = NullTypeClass diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py index 96c405f7257d04..e6ddea9a30489e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_config.py @@ -121,7 +121,10 @@ class LookerCommonConfig(DatasetSourceConfigMixin): "discoverable. When disabled, adds this information to the description of the column.", ) platform_name: str = Field( - "looker", description="Default platform name. Don't change." + # TODO: This shouldn't be part of the config. + "looker", + description="Default platform name.", + hidden_from_docs=True, ) extract_column_level_lineage: bool = Field( True, @@ -202,6 +205,10 @@ class LookerDashboardSourceConfig( False, description="Extract looks which are not part of any Dashboard. To enable this flag the stateful_ingestion should also be enabled.", ) + emit_used_explores_only: bool = Field( + True, + description="When enabled, only explores that are used by a Dashboard/Look will be ingested.", + ) @validator("external_base_url", pre=True, always=True) def external_url_defaults_to_api_config_base_url( @@ -213,7 +220,6 @@ def external_url_defaults_to_api_config_base_url( def stateful_ingestion_should_be_enabled( cls, v: Optional[bool], *, values: Dict[str, Any], **kwargs: Dict[str, Any] ) -> Optional[bool]: - stateful_ingestion: StatefulStaleMetadataRemovalConfig = cast( StatefulStaleMetadataRemovalConfig, values.get("stateful_ingestion") ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py index b00f74b71e7922..988caba1c0d748 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py @@ -59,6 +59,7 @@ class LookerAPIStats(BaseModel): lookml_model_calls: int = 0 all_dashboards_calls: int = 0 all_looks_calls: int = 0 + all_models_calls: int = 0 get_query_calls: int = 0 search_looks_calls: int = 0 search_dashboards_calls: int = 0 @@ -155,6 +156,12 @@ def dashboard(self, dashboard_id: str, fields: Union[str, List[str]]) -> Dashboa transport_options=self.transport_options, ) + def all_lookml_models(self) -> Sequence[LookmlModel]: + self.client_stats.all_models_calls += 1 + return self.client.all_lookml_models( + transport_options=self.transport_options, + ) + def lookml_model_explore(self, model: str, explore_name: str) -> LookmlModelExplore: self.client_stats.explore_calls += 1 return self.client.lookml_model_explore( diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 09683d790c14c7..4a98e8874bca0d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -147,9 +147,12 @@ def __init__(self, config: LookerDashboardSourceConfig, ctx: PipelineContext): ) self.reporter._looker_explore_registry = self.explore_registry self.reporter._looker_api = self.looker_api + self.reachable_look_registry = set() - self.explores_to_fetch_set: Dict[Tuple[str, str], List[str]] = {} + # (model, explore) -> list of charts/looks/dashboards that reference this explore + # The list values are used purely for debugging purposes. + self.reachable_explores: Dict[Tuple[str, str], List[str]] = {} # Keep stat generators to generate entity stat aspect later stat_generator_config: looker_usage.StatGeneratorConfig = ( @@ -378,11 +381,11 @@ def _get_input_fields_from_query( return result - def add_explore_to_fetch(self, model: str, explore: str, via: str) -> None: - if (model, explore) not in self.explores_to_fetch_set: - self.explores_to_fetch_set[(model, explore)] = [] + def add_reachable_explore(self, model: str, explore: str, via: str) -> None: + if (model, explore) not in self.reachable_explores: + self.reachable_explores[(model, explore)] = [] - self.explores_to_fetch_set[(model, explore)].append(via) + self.reachable_explores[(model, explore)].append(via) def _get_looker_dashboard_element( # noqa: C901 self, element: DashboardElement @@ -403,7 +406,7 @@ def _get_looker_dashboard_element( # noqa: C901 f"Element {element.title}: Explores added via query: {explores}" ) for exp in explores: - self.add_explore_to_fetch( + self.add_reachable_explore( model=element.query.model, explore=exp, via=f"look:{element.look_id}:query:{element.dashboard_id}", @@ -439,7 +442,7 @@ def _get_looker_dashboard_element( # noqa: C901 explores = [element.look.query.view] logger.debug(f"Element {title}: Explores added via look: {explores}") for exp in explores: - self.add_explore_to_fetch( + self.add_reachable_explore( model=element.look.query.model, explore=exp, via=f"Look:{element.look_id}:query:{element.dashboard_id}", @@ -483,7 +486,7 @@ def _get_looker_dashboard_element( # noqa: C901 ) for exp in explores: - self.add_explore_to_fetch( + self.add_reachable_explore( model=element.result_maker.query.model, explore=exp, via=f"Look:{element.look_id}:resultmaker:query", @@ -495,7 +498,7 @@ def _get_looker_dashboard_element( # noqa: C901 if filterable.view is not None and filterable.model is not None: model = filterable.model explores.append(filterable.view) - self.add_explore_to_fetch( + self.add_reachable_explore( model=filterable.model, explore=filterable.view, via=f"Look:{element.look_id}:resultmaker:filterable", @@ -694,20 +697,26 @@ def _make_dashboard_metadata_events( def _make_explore_metadata_events( self, ) -> Iterable[Union[MetadataChangeEvent, MetadataChangeProposalWrapper]]: + if self.source_config.emit_used_explores_only: + explores_to_fetch = list(self.reachable_explores.keys()) + else: + explores_to_fetch = list(self.list_all_explores()) + explores_to_fetch.sort() + with concurrent.futures.ThreadPoolExecutor( max_workers=self.source_config.max_threads ) as async_executor: - self.reporter.total_explores = len(self.explores_to_fetch_set) + self.reporter.total_explores = len(explores_to_fetch) explore_futures = { async_executor.submit(self.fetch_one_explore, model, explore): ( model, explore, ) - for (model, explore) in self.explores_to_fetch_set + for (model, explore) in explores_to_fetch } - for future in concurrent.futures.as_completed(explore_futures): + for future in concurrent.futures.wait(explore_futures).done: events, explore_id, start_time, end_time = future.result() del explore_futures[future] self.reporter.explores_scanned += 1 @@ -717,6 +726,17 @@ def _make_explore_metadata_events( f"Running time of fetch_one_explore for {explore_id}: {(end_time - start_time).total_seconds()}" ) + def list_all_explores(self) -> Iterable[Tuple[str, str]]: + # returns a list of (model, explore) tuples + + for model in self.looker_api.all_lookml_models(): + if model.name is None or model.explores is None: + continue + for explore in model.explores: + if explore.name is None: + continue + yield (model.name, explore.name) + def fetch_one_explore( self, model: str, explore: str ) -> Tuple[ @@ -954,7 +974,7 @@ def _input_fields_from_dashboard_element( ) if explore is not None: # add this to the list of explores to finally generate metadata for - self.add_explore_to_fetch( + self.add_reachable_explore( input_field.model, input_field.explore, entity_urn ) entity_urn = explore.get_explore_urn(self.source_config) diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py index 7fb2cf9813cab1..d11b1f9ad6a537 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py @@ -15,11 +15,12 @@ from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import capability from datahub.ingestion.api.ingestion_job_checkpointing_provider_base import ( IngestionCheckpointingProviderBase, JobId, ) -from datahub.ingestion.api.source import Source, SourceReport +from datahub.ingestion.api.source import Source, SourceCapability, SourceReport from datahub.ingestion.source.state.checkpoint import Checkpoint, StateType from datahub.ingestion.source.state.use_case_handler import ( StatefulIngestionUsecaseHandlerBase, @@ -177,6 +178,11 @@ class StatefulIngestionReport(SourceReport): pass +@capability( + SourceCapability.DELETION_DETECTION, + "Optionally enabled via `stateful_ingestion.remove_stale_metadata`", + supported=True, +) class StatefulIngestionSourceBase(Source): """ Defines the base class for all stateful sources. diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 4bc40b0aac9649..08df7599510f47 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -59,7 +59,7 @@ ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source import tableau_constant +from datahub.ingestion.source import tableau_constant as c from datahub.ingestion.source.common.subtypes import ( BIContainerSubTypes, DatasetSubTypes, @@ -720,16 +720,12 @@ def get_connection_object_page( query, connection_type, query_filter, count, offset, False ) - if tableau_constant.ERRORS in query_data: - errors = query_data[tableau_constant.ERRORS] + if c.ERRORS in query_data: + errors = query_data[c.ERRORS] if all( # The format of the error messages is highly unpredictable, so we have to # be extra defensive with our parsing. - error - and (error.get(tableau_constant.EXTENSIONS) or {}).get( - tableau_constant.SEVERITY - ) - == tableau_constant.WARNING + error and (error.get(c.EXTENSIONS) or {}).get(c.SEVERITY) == c.WARNING for error in errors ): self.report.report_warning(key=connection_type, reason=f"{errors}") @@ -737,14 +733,14 @@ def get_connection_object_page( raise RuntimeError(f"Query {connection_type} error: {errors}") connection_object = ( - query_data.get(tableau_constant.DATA).get(connection_type, {}) - if query_data.get(tableau_constant.DATA) + query_data.get(c.DATA).get(connection_type, {}) + if query_data.get(c.DATA) else {} ) - total_count = connection_object.get(tableau_constant.TOTAL_COUNT, 0) - has_next_page = connection_object.get(tableau_constant.PAGE_INFO, {}).get( - tableau_constant.HAS_NEXT_PAGE, False + total_count = connection_object.get(c.TOTAL_COUNT, 0) + has_next_page = connection_object.get(c.PAGE_INFO, {}).get( + c.HAS_NEXT_PAGE, False ) return connection_object, total_count, has_next_page @@ -781,7 +777,7 @@ def get_connection_objects( offset += count - for obj in connection_objects.get(tableau_constant.NODES) or []: + for obj in connection_objects.get(c.NODES) or []: yield obj def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: @@ -790,11 +786,11 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: project.name for project in self.tableau_project_registry.values() ] project_names_str: str = json.dumps(project_names) - projects = f"{tableau_constant.PROJECT_NAME_WITH_IN}: {project_names_str}" + projects = f"{c.PROJECT_NAME_WITH_IN}: {project_names_str}" for workbook in self.get_connection_objects( workbook_graphql_query, - tableau_constant.WORKBOOKS_CONNECTION, + c.WORKBOOKS_CONNECTION, projects, page_size_override=self.config.workbook_page_size, ): @@ -804,11 +800,9 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: # however Tableau supports projectLuidWithin in Tableau Cloud June 2022 / Server 2022.3 and later. project_luid: Optional[str] = self._get_workbook_project_luid(workbook) if project_luid not in self.tableau_project_registry.keys(): - wrk_name: Optional[str] = workbook.get(tableau_constant.NAME) - wrk_id: Optional[str] = workbook.get(tableau_constant.ID) - prj_name: Optional[str] = workbook.get( - tableau_constant.PROJECT_NAME - ) + wrk_name: Optional[str] = workbook.get(c.NAME) + wrk_id: Optional[str] = workbook.get(c.ID) + prj_name: Optional[str] = workbook.get(c.PROJECT_NAME) logger.debug( f"Skipping workbook {wrk_name}({wrk_id}) as it is project {prj_name}({project_luid}) not " @@ -818,25 +812,22 @@ def emit_workbooks(self) -> Iterable[MetadataWorkUnit]: yield from self.emit_workbook_as_container(workbook) - for sheet in workbook.get(tableau_constant.SHEETS, []): - self.sheet_ids.append(sheet[tableau_constant.ID]) + for sheet in workbook.get(c.SHEETS, []): + self.sheet_ids.append(sheet[c.ID]) - for dashboard in workbook.get(tableau_constant.DASHBOARDS, []): - self.dashboard_ids.append(dashboard[tableau_constant.ID]) + for dashboard in workbook.get(c.DASHBOARDS, []): + self.dashboard_ids.append(dashboard[c.ID]) - for ds in workbook.get(tableau_constant.EMBEDDED_DATA_SOURCES, []): - self.embedded_datasource_ids_being_used.append( - ds[tableau_constant.ID] - ) + for ds in workbook.get(c.EMBEDDED_DATA_SOURCES, []): + self.embedded_datasource_ids_being_used.append(ds[c.ID]) def _track_custom_sql_ids(self, field: dict) -> None: # Tableau shows custom sql datasource as a table in ColumnField's upstreamColumns. - for column in field.get(tableau_constant.UPSTREAM_COLUMNS, []): + for column in field.get(c.UPSTREAM_COLUMNS, []): table_id = ( - column.get(tableau_constant.TABLE, {}).get(tableau_constant.ID) - if column.get(tableau_constant.TABLE) - and column[tableau_constant.TABLE][tableau_constant.TYPE_NAME] - == tableau_constant.CUSTOM_SQL_TABLE + column.get(c.TABLE, {}).get(c.ID) + if column.get(c.TABLE) + and column[c.TABLE][c.TYPE_NAME] == c.CUSTOM_SQL_TABLE else None ) @@ -861,15 +852,15 @@ def _create_upstream_table_lineage( # and published datasource have same upstreamTables in this case. if upstream_tables and is_embedded_ds: logger.debug( - f"Embedded datasource {datasource.get(tableau_constant.ID)} has upstreamDatasources.\ + f"Embedded datasource {datasource.get(c.ID)} has upstreamDatasources.\ Setting only upstreamDatasources lineage. The upstreamTables lineage \ will be set via upstream published datasource." ) else: # This adds an edge to upstream DatabaseTables using `upstreamTables` upstreams, id_to_urn = self.get_upstream_tables( - datasource.get(tableau_constant.UPSTREAM_TABLES, []), - datasource.get(tableau_constant.NAME), + datasource.get(c.UPSTREAM_TABLES, []), + datasource.get(c.NAME), browse_path, is_custom_sql=False, ) @@ -878,23 +869,23 @@ def _create_upstream_table_lineage( # This adds an edge to upstream CustomSQLTables using `fields`.`upstreamColumns`.`table` csql_upstreams, csql_id_to_urn = self.get_upstream_csql_tables( - datasource.get(tableau_constant.FIELDS) or [], + datasource.get(c.FIELDS) or [], ) upstream_tables.extend(csql_upstreams) table_id_to_urn.update(csql_id_to_urn) logger.debug( - f"A total of {len(upstream_tables)} upstream table edges found for datasource {datasource[tableau_constant.ID]}" + f"A total of {len(upstream_tables)} upstream table edges found for datasource {datasource[c.ID]}" ) datasource_urn = builder.make_dataset_urn_with_platform_instance( platform=self.platform, - name=datasource[tableau_constant.ID], + name=datasource[c.ID], platform_instance=self.config.platform_instance, env=self.config.env, ) - if datasource.get(tableau_constant.FIELDS): + if datasource.get(c.FIELDS): if self.config.extract_column_level_lineage: # Find fine grained lineage for datasource column to datasource column edge, # upstream columns may be from same datasource @@ -912,20 +903,20 @@ def _create_upstream_table_lineage( fine_grained_lineages.extend(upstream_columns) logger.debug( - f"A total of {len(fine_grained_lineages)} upstream column edges found for datasource {datasource[tableau_constant.ID]}" + f"A total of {len(fine_grained_lineages)} upstream column edges found for datasource {datasource[c.ID]}" ) return upstream_tables, fine_grained_lineages def get_upstream_datasources(self, datasource: dict) -> List[Upstream]: upstream_tables = [] - for ds in datasource.get(tableau_constant.UPSTREAM_DATA_SOURCES, []): - if ds[tableau_constant.ID] not in self.datasource_ids_being_used: - self.datasource_ids_being_used.append(ds[tableau_constant.ID]) + for ds in datasource.get(c.UPSTREAM_DATA_SOURCES, []): + if ds[c.ID] not in self.datasource_ids_being_used: + self.datasource_ids_being_used.append(ds[c.ID]) upstream_ds_urn = builder.make_dataset_urn_with_platform_instance( platform=self.platform, - name=ds[tableau_constant.ID], + name=ds[c.ID], platform_instance=self.config.platform_instance, env=self.config.env, ) @@ -943,20 +934,15 @@ def get_upstream_csql_tables( csql_id_to_urn = {} for field in fields: - if not field.get(tableau_constant.UPSTREAM_COLUMNS): + if not field.get(c.UPSTREAM_COLUMNS): continue - for upstream_col in field[tableau_constant.UPSTREAM_COLUMNS]: + for upstream_col in field[c.UPSTREAM_COLUMNS]: if ( upstream_col - and upstream_col.get(tableau_constant.TABLE) - and upstream_col.get(tableau_constant.TABLE)[ - tableau_constant.TYPE_NAME - ] - == tableau_constant.CUSTOM_SQL_TABLE + and upstream_col.get(c.TABLE) + and upstream_col.get(c.TABLE)[c.TYPE_NAME] == c.CUSTOM_SQL_TABLE ): - upstream_table_id = upstream_col.get(tableau_constant.TABLE)[ - tableau_constant.ID - ] + upstream_table_id = upstream_col.get(c.TABLE)[c.ID] csql_urn = builder.make_dataset_urn_with_platform_instance( platform=self.platform, @@ -986,18 +972,18 @@ def get_upstream_tables( for table in tables: # skip upstream tables when there is no column info when retrieving datasource # Lineage and Schema details for these will be taken care in self.emit_custom_sql_datasources() - num_tbl_cols: Optional[int] = table.get( - tableau_constant.COLUMNS_CONNECTION - ) and table[tableau_constant.COLUMNS_CONNECTION].get("totalCount") + num_tbl_cols: Optional[int] = table.get(c.COLUMNS_CONNECTION) and table[ + c.COLUMNS_CONNECTION + ].get("totalCount") if not is_custom_sql and not num_tbl_cols: logger.debug( - f"Skipping upstream table with id {table[tableau_constant.ID]}, no columns: {table}" + f"Skipping upstream table with id {table[c.ID]}, no columns: {table}" ) continue - elif table[tableau_constant.NAME] is None: + elif table[c.NAME] is None: self.report.num_upstream_table_skipped_no_name += 1 logger.warning( - f"Skipping upstream table {table[tableau_constant.ID]} from lineage since its name is none: {table}" + f"Skipping upstream table {table[c.ID]} from lineage since its name is none: {table}" ) continue @@ -1014,7 +1000,7 @@ def get_upstream_tables( self.config.platform_instance_map, self.config.lineage_overrides, ) - table_id_to_urn[table[tableau_constant.ID]] = table_urn + table_id_to_urn[table[c.ID]] = table_urn upstream_table = Upstream( dataset=table_urn, @@ -1029,13 +1015,13 @@ def get_upstream_tables( if table_urn not in self.database_tables: self.database_tables[table_urn] = DatabaseTable( urn=table_urn, - id=table[tableau_constant.ID], + id=table[c.ID], num_cols=num_tbl_cols, paths={table_path} if table_path else set(), ) else: self.database_tables[table_urn].update_table( - table[tableau_constant.ID], num_tbl_cols, table_path + table[c.ID], num_tbl_cols, table_path ) return upstream_tables, table_id_to_urn @@ -1047,24 +1033,24 @@ def get_upstream_columns_of_fields_in_datasource( table_id_to_urn: Dict[str, str], ) -> List[FineGrainedLineage]: fine_grained_lineages = [] - for field in datasource.get(tableau_constant.FIELDS) or []: - field_name = field.get(tableau_constant.NAME) + for field in datasource.get(c.FIELDS) or []: + field_name = field.get(c.NAME) # upstreamColumns lineage will be set via upstreamFields. # such as for CalculatedField if ( not field_name - or not field.get(tableau_constant.UPSTREAM_COLUMNS) - or field.get(tableau_constant.UPSTREAM_FIELDS) + or not field.get(c.UPSTREAM_COLUMNS) + or field.get(c.UPSTREAM_FIELDS) ): continue input_columns = [] - for upstream_col in field.get(tableau_constant.UPSTREAM_COLUMNS): + for upstream_col in field.get(c.UPSTREAM_COLUMNS): if not upstream_col: continue - name = upstream_col.get(tableau_constant.NAME) + name = upstream_col.get(c.NAME) upstream_table_id = ( - upstream_col.get(tableau_constant.TABLE)[tableau_constant.ID] - if upstream_col.get(tableau_constant.TABLE) + upstream_col.get(c.TABLE)[c.ID] + if upstream_col.get(c.TABLE) else None ) if ( @@ -1110,23 +1096,21 @@ def get_upstream_fields_of_field_in_datasource( self, datasource: dict, datasource_urn: str ) -> List[FineGrainedLineage]: fine_grained_lineages = [] - for field in datasource.get(tableau_constant.FIELDS) or []: - field_name = field.get(tableau_constant.NAME) + for field in datasource.get(c.FIELDS) or []: + field_name = field.get(c.NAME) # It is observed that upstreamFields gives one-hop field # lineage, and not multi-hop field lineage # This behavior is as desired in our case. - if not field_name or not field.get(tableau_constant.UPSTREAM_FIELDS): + if not field_name or not field.get(c.UPSTREAM_FIELDS): continue input_fields = [] - for upstream_field in field.get(tableau_constant.UPSTREAM_FIELDS): + for upstream_field in field.get(c.UPSTREAM_FIELDS): if not upstream_field: continue - name = upstream_field.get(tableau_constant.NAME) + name = upstream_field.get(c.NAME) upstream_ds_id = ( - upstream_field.get(tableau_constant.DATA_SOURCE)[ - tableau_constant.ID - ] - if upstream_field.get(tableau_constant.DATA_SOURCE) + upstream_field.get(c.DATA_SOURCE)[c.ID] + if upstream_field.get(c.DATA_SOURCE) else None ) if name and upstream_ds_id: @@ -1212,35 +1196,37 @@ def get_upstream_fields_from_custom_sql( return fine_grained_lineages def get_transform_operation(self, field: dict) -> str: - field_type = field[tableau_constant.TYPE_NAME] + field_type = field[c.TYPE_NAME] if field_type in ( - tableau_constant.DATA_SOURCE_FIELD, - tableau_constant.COLUMN_FIELD, + c.DATA_SOURCE_FIELD, + c.COLUMN_FIELD, ): - op = tableau_constant.IDENTITY # How to specify exact same - elif field_type == tableau_constant.CALCULATED_FIELD: + op = c.IDENTITY # How to specify exact same + elif field_type == c.CALCULATED_FIELD: op = field_type - if field.get(tableau_constant.FORMULA): - op += f"formula: {field.get(tableau_constant.FORMULA)}" + if field.get(c.FORMULA): + op += f"formula: {field.get(c.FORMULA)}" else: op = field_type # BinField, CombinedField, etc return op def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: - custom_sql_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.custom_sql_ids_being_used)}" + custom_sql_filter = ( + f"{c.ID_WITH_IN}: {json.dumps(self.custom_sql_ids_being_used)}" + ) custom_sql_connection = list( self.get_connection_objects( custom_sql_graphql_query, - tableau_constant.CUSTOM_SQL_TABLE_CONNECTION, + c.CUSTOM_SQL_TABLE_CONNECTION, custom_sql_filter, ) ) unique_custom_sql = get_unique_custom_sql(custom_sql_connection) for csql in unique_custom_sql: - csql_id: str = csql[tableau_constant.ID] + csql_id: str = csql[c.ID] csql_urn = builder.make_dataset_urn_with_platform_instance( platform=self.platform, name=csql_id, @@ -1256,40 +1242,33 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: datasource_name = None project = None - if len(csql[tableau_constant.DATA_SOURCES]) > 0: + if len(csql[c.DATA_SOURCES]) > 0: # CustomSQLTable id owned by exactly one tableau data source logger.debug( - f"Number of datasources referencing CustomSQLTable: {len(csql[tableau_constant.DATA_SOURCES])}" + f"Number of datasources referencing CustomSQLTable: {len(csql[c.DATA_SOURCES])}" ) - datasource = csql[tableau_constant.DATA_SOURCES][0] - datasource_name = datasource.get(tableau_constant.NAME) + datasource = csql[c.DATA_SOURCES][0] + datasource_name = datasource.get(c.NAME) if datasource.get( - tableau_constant.TYPE_NAME - ) == tableau_constant.EMBEDDED_DATA_SOURCE and datasource.get( - tableau_constant.WORKBOOK - ): + c.TYPE_NAME + ) == c.EMBEDDED_DATA_SOURCE and datasource.get(c.WORKBOOK): datasource_name = ( - f"{datasource.get(tableau_constant.WORKBOOK).get(tableau_constant.NAME)}/{datasource_name}" - if datasource_name - and datasource.get(tableau_constant.WORKBOOK).get( - tableau_constant.NAME - ) + f"{datasource.get(c.WORKBOOK).get(c.NAME)}/{datasource_name}" + if datasource_name and datasource.get(c.WORKBOOK).get(c.NAME) else None ) logger.debug( f"Adding datasource {datasource_name}({datasource.get('id')}) to container" ) yield from add_entity_to_container( - self.gen_workbook_key( - datasource[tableau_constant.WORKBOOK][tableau_constant.ID] - ), - tableau_constant.DATASET, + self.gen_workbook_key(datasource[c.WORKBOOK][c.ID]), + c.DATASET, dataset_snapshot.urn, ) project = self._get_project_browse_path_name(datasource) - tables = csql.get(tableau_constant.TABLES, []) + tables = csql.get(c.TABLES, []) if tables: # lineage from custom sql -> datasets/tables # @@ -1306,9 +1285,8 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: # Schema Metadata # if condition is needed as graphQL return "cloumns": None columns: List[Dict[Any, Any]] = ( - cast(List[Dict[Any, Any]], csql.get(tableau_constant.COLUMNS)) - if tableau_constant.COLUMNS in csql - and csql.get(tableau_constant.COLUMNS) is not None + cast(List[Dict[Any, Any]], csql.get(c.COLUMNS)) + if c.COLUMNS in csql and csql.get(c.COLUMNS) is not None else [] ) schema_metadata = self.get_schema_metadata_for_custom_sql(columns) @@ -1320,7 +1298,7 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: if project and datasource_name: browse_paths = BrowsePathsClass( paths=[ - f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource[tableau_constant.NAME]}" + f"/{self.config.env.lower()}/{self.platform}/{project}/{datasource[c.NAME]}" ] ) dataset_snapshot.aspects.append(browse_paths) @@ -1328,27 +1306,25 @@ def emit_custom_sql_datasources(self) -> Iterable[MetadataWorkUnit]: logger.debug(f"Browse path not set for Custom SQL table {csql_id}") dataset_properties = DatasetPropertiesClass( - name=csql.get(tableau_constant.NAME), - description=csql.get(tableau_constant.DESCRIPTION), + name=csql.get(c.NAME), + description=csql.get(c.DESCRIPTION), ) dataset_snapshot.aspects.append(dataset_properties) - if csql.get(tableau_constant.QUERY): + if csql.get(c.QUERY): view_properties = ViewPropertiesClass( materialized=False, - viewLanguage=tableau_constant.SQL, - viewLogic=clean_query(csql[tableau_constant.QUERY]), + viewLanguage=c.SQL, + viewLogic=clean_query(csql[c.QUERY]), ) dataset_snapshot.aspects.append(view_properties) yield self.get_metadata_change_event(dataset_snapshot) yield self.get_metadata_change_proposal( dataset_snapshot.urn, - aspect_name=tableau_constant.SUB_TYPES, - aspect=SubTypesClass( - typeNames=[DatasetSubTypes.VIEW, tableau_constant.CUSTOM_SQL] - ), + aspect_name=c.SUB_TYPES, + aspect=SubTypesClass(typeNames=[DatasetSubTypes.VIEW, c.CUSTOM_SQL]), ) def get_schema_metadata_for_custom_sql( @@ -1359,21 +1335,19 @@ def get_schema_metadata_for_custom_sql( for field in columns: # Datasource fields - if field.get(tableau_constant.NAME) is None: + if field.get(c.NAME) is None: self.report.num_csql_field_skipped_no_name += 1 logger.warning( - f"Skipping field {field[tableau_constant.ID]} from schema since its name is none" + f"Skipping field {field[c.ID]} from schema since its name is none" ) continue - nativeDataType = field.get( - tableau_constant.REMOTE_TYPE, tableau_constant.UNKNOWN - ) + nativeDataType = field.get(c.REMOTE_TYPE, c.UNKNOWN) TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass) schema_field = SchemaField( - fieldPath=field[tableau_constant.NAME], + fieldPath=field[c.NAME], type=SchemaFieldDataType(type=TypeClass()), nativeDataType=nativeDataType, - description=field.get(tableau_constant.DESCRIPTION), + description=field.get(c.DESCRIPTION), ) fields.append(schema_field) @@ -1391,28 +1365,25 @@ def _get_published_datasource_project_luid(self, ds: dict) -> Optional[str]: # This is fallback in case "get all datasources" query fails for some reason. # It is possible due to https://github.com/tableau/server-client-python/issues/1210 if ( - ds.get(tableau_constant.LUID) - and ds[tableau_constant.LUID] not in self.datasource_project_map.keys() + ds.get(c.LUID) + and ds[c.LUID] not in self.datasource_project_map.keys() and self.report.get_all_datasources_query_failed ): logger.debug( - f"published datasource {ds.get(tableau_constant.NAME)} project_luid not found." - f" Running get datasource query for {ds[tableau_constant.LUID]}" + f"published datasource {ds.get(c.NAME)} project_luid not found." + f" Running get datasource query for {ds[c.LUID]}" ) # Query and update self.datasource_project_map with luid - self._query_published_datasource_for_project_luid(ds[tableau_constant.LUID]) + self._query_published_datasource_for_project_luid(ds[c.LUID]) if ( - ds.get(tableau_constant.LUID) - and ds[tableau_constant.LUID] in self.datasource_project_map.keys() - and self.datasource_project_map[ds[tableau_constant.LUID]] - in self.tableau_project_registry + ds.get(c.LUID) + and ds[c.LUID] in self.datasource_project_map.keys() + and self.datasource_project_map[ds[c.LUID]] in self.tableau_project_registry ): - return self.datasource_project_map[ds[tableau_constant.LUID]] + return self.datasource_project_map[ds[c.LUID]] - logger.debug( - f"published datasource {ds.get(tableau_constant.NAME)} project_luid not found" - ) + logger.debug(f"published datasource {ds.get(c.NAME)} project_luid not found") return None @@ -1437,60 +1408,52 @@ def _query_published_datasource_for_project_luid(self, ds_luid: str) -> None: logger.debug("Error stack trace", exc_info=True) def _get_workbook_project_luid(self, wb: dict) -> Optional[str]: - if wb.get(tableau_constant.LUID) and self.workbook_project_map.get( - wb[tableau_constant.LUID] - ): - return self.workbook_project_map[wb[tableau_constant.LUID]] + if wb.get(c.LUID) and self.workbook_project_map.get(wb[c.LUID]): + return self.workbook_project_map[wb[c.LUID]] - logger.debug(f"workbook {wb.get(tableau_constant.NAME)} project_luid not found") + logger.debug(f"workbook {wb.get(c.NAME)} project_luid not found") return None def _get_embedded_datasource_project_luid(self, ds: dict) -> Optional[str]: - if ds.get(tableau_constant.WORKBOOK): + if ds.get(c.WORKBOOK): project_luid: Optional[str] = self._get_workbook_project_luid( - ds[tableau_constant.WORKBOOK] + ds[c.WORKBOOK] ) if project_luid and project_luid in self.tableau_project_registry: return project_luid - logger.debug( - f"embedded datasource {ds.get(tableau_constant.NAME)} project_luid not found" - ) + logger.debug(f"embedded datasource {ds.get(c.NAME)} project_luid not found") return None def _get_datasource_project_luid(self, ds: dict) -> Optional[str]: # Only published and embedded data-sources are supported - ds_type: Optional[str] = ds.get(tableau_constant.TYPE_NAME) + ds_type: Optional[str] = ds.get(c.TYPE_NAME) if ds_type not in ( - tableau_constant.PUBLISHED_DATA_SOURCE, - tableau_constant.EMBEDDED_DATA_SOURCE, + c.PUBLISHED_DATA_SOURCE, + c.EMBEDDED_DATA_SOURCE, ): logger.debug( - f"datasource {ds.get(tableau_constant.NAME)} type {ds.get(tableau_constant.TYPE_NAME)} is " + f"datasource {ds.get(c.NAME)} type {ds.get(c.TYPE_NAME)} is " f"unsupported" ) return None func_selector: Any = { - tableau_constant.PUBLISHED_DATA_SOURCE: self._get_published_datasource_project_luid, - tableau_constant.EMBEDDED_DATA_SOURCE: self._get_embedded_datasource_project_luid, + c.PUBLISHED_DATA_SOURCE: self._get_published_datasource_project_luid, + c.EMBEDDED_DATA_SOURCE: self._get_embedded_datasource_project_luid, } return func_selector[ds_type](ds) @staticmethod def _get_datasource_project_name(ds: dict) -> Optional[str]: - if ds.get( - tableau_constant.TYPE_NAME - ) == tableau_constant.EMBEDDED_DATA_SOURCE and ds.get( - tableau_constant.WORKBOOK - ): - return ds[tableau_constant.WORKBOOK].get(tableau_constant.PROJECT_NAME) - if ds.get(tableau_constant.TYPE_NAME) == tableau_constant.PUBLISHED_DATA_SOURCE: - return ds.get(tableau_constant.PROJECT_NAME) + if ds.get(c.TYPE_NAME) == c.EMBEDDED_DATA_SOURCE and ds.get(c.WORKBOOK): + return ds[c.WORKBOOK].get(c.PROJECT_NAME) + if ds.get(c.TYPE_NAME) == c.PUBLISHED_DATA_SOURCE: + return ds.get(c.PROJECT_NAME) return None def _get_project_browse_path_name(self, ds: dict) -> Optional[str]: @@ -1502,7 +1465,7 @@ def _get_project_browse_path_name(self, ds: dict) -> Optional[str]: project_luid = self._get_datasource_project_luid(ds) if project_luid is None: logger.warning( - f"Could not load project hierarchy for datasource {ds.get(tableau_constant.NAME)}. Please check permissions." + f"Could not load project hierarchy for datasource {ds.get(c.NAME)}. Please check permissions." ) logger.debug(f"datasource = {ds}") return None @@ -1515,7 +1478,7 @@ def _create_lineage_to_upstream_tables( # This adds an edge to upstream DatabaseTables using `upstreamTables` upstream_tables, _ = self.get_upstream_tables( tables, - datasource.get(tableau_constant.NAME) or "", + datasource.get(c.NAME) or "", self._get_project_browse_path_name(datasource), is_custom_sql=True, ) @@ -1524,7 +1487,7 @@ def _create_lineage_to_upstream_tables( upstream_lineage = UpstreamLineage(upstreams=upstream_tables) yield self.get_metadata_change_proposal( csql_urn, - aspect_name=tableau_constant.UPSTREAM_LINEAGE, + aspect_name=c.UPSTREAM_LINEAGE, aspect=upstream_lineage, ) @@ -1547,22 +1510,19 @@ def parse_custom_sql( ] ], ) -> Optional["SqlParsingResult"]: - database_info = datasource.get(tableau_constant.DATABASE) or {} + database_info = datasource.get(c.DATABASE) or {} - if datasource.get(tableau_constant.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False): + if datasource.get(c.IS_UNSUPPORTED_CUSTOM_SQL) in (None, False): logger.debug(f"datasource {datasource_urn} is not created from custom sql") return None - if ( - tableau_constant.NAME not in database_info - or tableau_constant.CONNECTION_TYPE not in database_info - ): + if c.NAME not in database_info or c.CONNECTION_TYPE not in database_info: logger.debug( f"database information is missing from datasource {datasource_urn}" ) return None - query = datasource.get(tableau_constant.QUERY) + query = datasource.get(c.QUERY) if query is None: logger.debug( f"raw sql query is not available for datasource {datasource_urn}" @@ -1571,13 +1531,13 @@ def parse_custom_sql( logger.debug(f"Parsing sql={query}") - upstream_db = database_info.get(tableau_constant.NAME) + upstream_db = database_info.get(c.NAME) if func_overridden_info is not None: # Override the information as per configuration upstream_db, platform_instance, platform, _ = func_overridden_info( - database_info[tableau_constant.CONNECTION_TYPE], - database_info.get(tableau_constant.NAME), + database_info[c.CONNECTION_TYPE], + database_info.get(c.NAME), self.config.platform_instance_map, self.config.lineage_overrides, ) @@ -1631,7 +1591,7 @@ def _create_lineage_from_unsupported_csql( yield self.get_metadata_change_proposal( csql_urn, - aspect_name=tableau_constant.UPSTREAM_LINEAGE, + aspect_name=c.UPSTREAM_LINEAGE, aspect=upstream_lineage, ) @@ -1642,10 +1602,10 @@ def _get_schema_metadata_for_datasource( for field in datasource_fields: # check datasource - custom sql relations from a field being referenced self._track_custom_sql_ids(field) - if field.get(tableau_constant.NAME) is None: + if field.get(c.NAME) is None: self.report.num_upstream_table_skipped_no_name += 1 logger.warning( - f"Skipping field {field[tableau_constant.ID]} from schema since its name is none" + f"Skipping field {field[c.ID]} from schema since its name is none" ) continue @@ -1678,7 +1638,7 @@ def get_metadata_change_proposal( aspect: Union["UpstreamLineage", "SubTypesClass"], ) -> MetadataWorkUnit: return MetadataChangeProposalWrapper( - entityType=tableau_constant.DATASET, + entityType=c.DATASET, changeType=ChangeTypeClass.UPSERT, entityUrn=urn, aspectName=aspect_name, @@ -1696,10 +1656,8 @@ def emit_datasource( datasource_info = datasource browse_path = self._get_project_browse_path_name(datasource) - logger.debug( - f"datasource {datasource.get(tableau_constant.NAME)} browse-path {browse_path}" - ) - datasource_id = datasource[tableau_constant.ID] + logger.debug(f"datasource {datasource.get(c.NAME)} browse-path {browse_path}") + datasource_id = datasource[c.ID] datasource_urn = builder.make_dataset_urn_with_platform_instance( self.platform, datasource_id, self.config.platform_instance, self.config.env ) @@ -1713,13 +1671,10 @@ def emit_datasource( # Browse path - if ( - browse_path - and is_embedded_ds - and workbook - and workbook.get(tableau_constant.NAME) - ): - browse_path = f"{browse_path}/{workbook[tableau_constant.NAME].replace('/', REPLACE_SLASH_CHAR)}" + if browse_path and is_embedded_ds and workbook and workbook.get(c.NAME): + browse_path = ( + f"{browse_path}/{workbook[c.NAME].replace('/', REPLACE_SLASH_CHAR)}" + ) if browse_path: browse_paths = BrowsePathsClass( @@ -1729,12 +1684,10 @@ def emit_datasource( # Ownership owner = ( - self._get_ownership( - datasource_info[tableau_constant.OWNER][tableau_constant.USERNAME] - ) + self._get_ownership(datasource_info[c.OWNER][c.USERNAME]) if datasource_info - and datasource_info.get(tableau_constant.OWNER) - and datasource_info[tableau_constant.OWNER].get(tableau_constant.USERNAME) + and datasource_info.get(c.OWNER) + and datasource_info[c.OWNER].get(c.USERNAME) else None ) if owner is not None: @@ -1742,24 +1695,22 @@ def emit_datasource( # Dataset properties dataset_props = DatasetPropertiesClass( - name=datasource.get(tableau_constant.NAME), - description=datasource.get(tableau_constant.DESCRIPTION), + name=datasource.get(c.NAME), + description=datasource.get(c.DESCRIPTION), customProperties=self.get_custom_props_from_dict( datasource, [ - tableau_constant.HAS_EXTRACTS, - tableau_constant.EXTRACT_LAST_REFRESH_TIME, - tableau_constant.EXTRACT_LAST_INCREMENTAL_UPDATE_TIME, - tableau_constant.EXTRACT_LAST_UPDATE_TIME, + c.HAS_EXTRACTS, + c.EXTRACT_LAST_REFRESH_TIME, + c.EXTRACT_LAST_INCREMENTAL_UPDATE_TIME, + c.EXTRACT_LAST_UPDATE_TIME, ], ), ) dataset_snapshot.aspects.append(dataset_props) # Upstream Tables - if datasource.get(tableau_constant.UPSTREAM_TABLES) or datasource.get( - tableau_constant.UPSTREAM_DATA_SOURCES - ): + if datasource.get(c.UPSTREAM_TABLES) or datasource.get(c.UPSTREAM_DATA_SOURCES): # datasource -> db table relations ( upstream_tables, @@ -1779,13 +1730,13 @@ def emit_datasource( ) yield self.get_metadata_change_proposal( datasource_urn, - aspect_name=tableau_constant.UPSTREAM_LINEAGE, + aspect_name=c.UPSTREAM_LINEAGE, aspect=upstream_lineage, ) # Datasource Fields schema_metadata = self._get_schema_metadata_for_datasource( - datasource.get(tableau_constant.FIELDS, []) + datasource.get(c.FIELDS, []) ) if schema_metadata is not None: dataset_snapshot.aspects.append(schema_metadata) @@ -1793,7 +1744,7 @@ def emit_datasource( yield self.get_metadata_change_event(dataset_snapshot) yield self.get_metadata_change_proposal( dataset_snapshot.urn, - aspect_name=tableau_constant.SUB_TYPES, + aspect_name=c.SUB_TYPES, aspect=SubTypesClass( typeNames=( ["Embedded Data Source"] @@ -1809,7 +1760,7 @@ def emit_datasource( if container_key is not None: yield from add_entity_to_container( container_key, - tableau_constant.DATASET, + c.DATASET, dataset_snapshot.urn, ) @@ -1822,10 +1773,10 @@ def _get_datasource_container_key( container_key: Optional[ContainerKey] = None if is_embedded_ds: # It is embedded then parent is container is workbook if workbook is not None: - container_key = self.gen_workbook_key(workbook[tableau_constant.ID]) + container_key = self.gen_workbook_key(workbook[c.ID]) else: logger.warning( - f"Parent container not set for embedded datasource {datasource[tableau_constant.ID]}" + f"Parent container not set for embedded datasource {datasource[c.ID]}" ) else: parent_project_luid = self._get_published_datasource_project_luid( @@ -1836,17 +1787,19 @@ def _get_datasource_container_key( container_key = self.gen_project_key(parent_project_luid) else: logger.warning( - f"Parent container not set for published datasource {datasource[tableau_constant.ID]}" + f"Parent container not set for published datasource {datasource[c.ID]}" ) return container_key def emit_published_datasources(self) -> Iterable[MetadataWorkUnit]: - datasource_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.datasource_ids_being_used)}" + datasource_filter = ( + f"{c.ID_WITH_IN}: {json.dumps(self.datasource_ids_being_used)}" + ) for datasource in self.get_connection_objects( published_datasource_graphql_query, - tableau_constant.PUBLISHED_DATA_SOURCES_CONNECTION, + c.PUBLISHED_DATA_SOURCES_CONNECTION, datasource_filter, ): yield from self.emit_datasource(datasource) @@ -1855,11 +1808,13 @@ def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]: database_table_id_to_urn_map: Dict[str, str] = dict() for urn, tbl in self.database_tables.items(): database_table_id_to_urn_map[tbl.id] = urn - tables_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(list(database_table_id_to_urn_map.keys()))}" + tables_filter = ( + f"{c.ID_WITH_IN}: {json.dumps(list(database_table_id_to_urn_map.keys()))}" + ) for table in self.get_connection_objects( database_tables_graphql_query, - tableau_constant.DATABASE_TABLES_CONNECTION, + c.DATABASE_TABLES_CONNECTION, tables_filter, ): yield from self.emit_table(table, database_table_id_to_urn_map) @@ -1867,11 +1822,9 @@ def emit_upstream_tables(self) -> Iterable[MetadataWorkUnit]: def emit_table( self, table: dict, database_table_id_to_urn_map: Dict[str, str] ) -> Iterable[MetadataWorkUnit]: - database_table = self.database_tables[ - database_table_id_to_urn_map[table[tableau_constant.ID]] - ] - columns = table.get(tableau_constant.COLUMNS, []) - is_embedded = table.get(tableau_constant.IS_EMBEDDED) or False + database_table = self.database_tables[database_table_id_to_urn_map[table[c.ID]]] + columns = table.get(c.COLUMNS, []) + is_embedded = table.get(c.IS_EMBEDDED) or False if not is_embedded and not self.config.ingest_tables_external: logger.debug( f"Skipping external table {database_table.urn} as ingest_tables_external is set to False" @@ -1907,21 +1860,19 @@ def get_schema_metadata_for_table( if columns: fields = [] for field in columns: - if field.get(tableau_constant.NAME) is None: + if field.get(c.NAME) is None: self.report.num_table_field_skipped_no_name += 1 logger.warning( - f"Skipping field {field[tableau_constant.ID]} from schema since its name is none" + f"Skipping field {field[c.ID]} from schema since its name is none" ) continue - nativeDataType = field.get( - tableau_constant.REMOTE_TYPE, tableau_constant.UNKNOWN - ) + nativeDataType = field.get(c.REMOTE_TYPE, c.UNKNOWN) TypeClass = FIELD_TYPE_MAPPING.get(nativeDataType, NullTypeClass) schema_field = SchemaField( - fieldPath=field[tableau_constant.NAME], + fieldPath=field[c.NAME], type=SchemaFieldDataType(type=TypeClass()), - description=field.get(tableau_constant.DESCRIPTION), + description=field.get(c.DESCRIPTION), nativeDataType=nativeDataType, ) @@ -1941,11 +1892,9 @@ def get_schema_metadata_for_table( def get_sheetwise_upstream_datasources(self, sheet: dict) -> set: sheet_upstream_datasources = set() - for field in sheet.get(tableau_constant.DATA_SOURCE_FIELDS) or []: - if field and field.get(tableau_constant.DATA_SOURCE): - sheet_upstream_datasources.add( - field[tableau_constant.DATA_SOURCE][tableau_constant.ID] - ) + for field in sheet.get(c.DATA_SOURCE_FIELDS) or []: + if field and field.get(c.DATA_SOURCE): + sheet_upstream_datasources.add(field[c.DATA_SOURCE][c.ID]) return sheet_upstream_datasources @@ -1961,20 +1910,20 @@ def _create_datahub_chart_usage_stat( def _get_chart_stat_wu( self, sheet: dict, sheet_urn: str ) -> Optional[MetadataWorkUnit]: - luid: Optional[str] = sheet.get(tableau_constant.LUID) + luid: Optional[str] = sheet.get(c.LUID) if luid is None: logger.debug( "stat:luid is none for sheet %s(id:%s)", - sheet.get(tableau_constant.NAME), - sheet.get(tableau_constant.ID), + sheet.get(c.NAME), + sheet.get(c.ID), ) return None usage_stat: Optional[UsageStat] = self.tableau_stat_registry.get(luid) if usage_stat is None: logger.debug( "stat:UsageStat is not available in tableau_stat_registry for sheet %s(id:%s)", - sheet.get(tableau_constant.NAME), - sheet.get(tableau_constant.ID), + sheet.get(c.NAME), + sheet.get(c.ID), ) return None @@ -1983,8 +1932,8 @@ def _get_chart_stat_wu( ) logger.debug( "stat: Chart usage stat work unit is created for %s(id:%s)", - sheet.get(tableau_constant.NAME), - sheet.get(tableau_constant.ID), + sheet.get(c.NAME), + sheet.get(c.ID), ) return MetadataChangeProposalWrapper( aspect=aspect, @@ -1992,22 +1941,20 @@ def _get_chart_stat_wu( ).as_workunit() def emit_sheets(self) -> Iterable[MetadataWorkUnit]: - sheets_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.sheet_ids)}" + sheets_filter = f"{c.ID_WITH_IN}: {json.dumps(self.sheet_ids)}" for sheet in self.get_connection_objects( sheet_graphql_query, - tableau_constant.SHEETS_CONNECTION, + c.SHEETS_CONNECTION, sheets_filter, ): - yield from self.emit_sheets_as_charts( - sheet, sheet.get(tableau_constant.WORKBOOK) - ) + yield from self.emit_sheets_as_charts(sheet, sheet.get(c.WORKBOOK)) def emit_sheets_as_charts( self, sheet: dict, workbook: Optional[Dict] ) -> Iterable[MetadataWorkUnit]: sheet_urn: str = builder.make_chart_urn( - self.platform, sheet[tableau_constant.ID], self.config.platform_instance + self.platform, sheet[c.ID], self.config.platform_instance ) chart_snapshot = ChartSnapshot( urn=sheet_urn, @@ -2015,34 +1962,32 @@ def emit_sheets_as_charts( ) creator: Optional[str] = None - if workbook is not None and workbook.get(tableau_constant.OWNER) is not None: - creator = workbook[tableau_constant.OWNER].get(tableau_constant.USERNAME) - created_at = sheet.get(tableau_constant.CREATED_AT, datetime.now()) - updated_at = sheet.get(tableau_constant.UPDATED_AT, datetime.now()) + if workbook is not None and workbook.get(c.OWNER) is not None: + creator = workbook[c.OWNER].get(c.USERNAME) + created_at = sheet.get(c.CREATED_AT, datetime.now()) + updated_at = sheet.get(c.UPDATED_AT, datetime.now()) last_modified = self.get_last_modified(creator, created_at, updated_at) - if sheet.get(tableau_constant.PATH): + if sheet.get(c.PATH): site_part = f"/site/{self.config.site}" if self.config.site else "" - sheet_external_url = f"{self.config.connect_uri}/#{site_part}/views/{sheet.get(tableau_constant.PATH)}" - elif ( - sheet.get(tableau_constant.CONTAINED_IN_DASHBOARDS) is not None - and len(sheet[tableau_constant.CONTAINED_IN_DASHBOARDS]) > 0 - and sheet[tableau_constant.CONTAINED_IN_DASHBOARDS][0] is not None - and sheet[tableau_constant.CONTAINED_IN_DASHBOARDS][0].get( - tableau_constant.PATH + sheet_external_url = ( + f"{self.config.connect_uri}/#{site_part}/views/{sheet.get(c.PATH)}" ) + elif ( + sheet.get(c.CONTAINED_IN_DASHBOARDS) is not None + and len(sheet[c.CONTAINED_IN_DASHBOARDS]) > 0 + and sheet[c.CONTAINED_IN_DASHBOARDS][0] is not None + and sheet[c.CONTAINED_IN_DASHBOARDS][0].get(c.PATH) ): # sheet contained in dashboard site_part = f"/t/{self.config.site}" if self.config.site else "" - dashboard_path = sheet[tableau_constant.CONTAINED_IN_DASHBOARDS][0][ - tableau_constant.PATH - ] - sheet_external_url = f"{self.config.connect_uri}{site_part}/authoring/{dashboard_path}/{sheet.get(tableau_constant.NAME, '')}" + dashboard_path = sheet[c.CONTAINED_IN_DASHBOARDS][0][c.PATH] + sheet_external_url = f"{self.config.connect_uri}{site_part}/authoring/{dashboard_path}/{sheet.get(c.NAME, '')}" else: # hidden or viz-in-tooltip sheet sheet_external_url = None input_fields: List[InputField] = [] - if sheet.get(tableau_constant.DATA_SOURCE_FIELDS): + if sheet.get(c.DATA_SOURCE_FIELDS): self.populate_sheet_upstream_fields(sheet, input_fields) # datasource urn @@ -2060,15 +2005,13 @@ def emit_sheets_as_charts( # Chart Info chart_info = ChartInfoClass( description="", - title=sheet.get(tableau_constant.NAME) or "", + title=sheet.get(c.NAME) or "", lastModified=last_modified, externalUrl=sheet_external_url if self.config.ingest_external_links_for_charts else None, inputs=sorted(datasource_urn), - customProperties=self.get_custom_props_from_dict( - sheet, [tableau_constant.LUID] - ), + customProperties=self.get_custom_props_from_dict(sheet, [c.LUID]), ) chart_snapshot.aspects.append(chart_info) # chart_snapshot doesn't support the stat aspect as list element and hence need to emit MCP @@ -2083,7 +2026,7 @@ def emit_sheets_as_charts( chart_snapshot.aspects.append(browse_paths) else: logger.warning( - f"Could not set browse path for workbook {sheet[tableau_constant.ID]}. Please check permissions." + f"Could not set browse path for workbook {sheet[c.ID]}. Please check permissions." ) # Ownership @@ -2107,9 +2050,7 @@ def emit_sheets_as_charts( ) if workbook is not None: yield from add_entity_to_container( - self.gen_workbook_key(workbook[tableau_constant.ID]), - tableau_constant.CHART, - chart_snapshot.urn, + self.gen_workbook_key(workbook[c.ID]), c.CHART, chart_snapshot.urn ) if input_fields: @@ -2134,14 +2075,12 @@ def _get_project_path(self, project: TableauProject) -> str: def populate_sheet_upstream_fields( self, sheet: dict, input_fields: List[InputField] ) -> None: - for field in sheet.get(tableau_constant.DATA_SOURCE_FIELDS): # type: ignore + for field in sheet.get(c.DATA_SOURCE_FIELDS): # type: ignore if not field: continue - name = field.get(tableau_constant.NAME) + name = field.get(c.NAME) upstream_ds_id = ( - field.get(tableau_constant.DATA_SOURCE)[tableau_constant.ID] - if field.get(tableau_constant.DATA_SOURCE) - else None + field.get(c.DATA_SOURCE)[c.ID] if field.get(c.DATA_SOURCE) else None ) if name and upstream_ds_id: input_fields.append( @@ -2162,10 +2101,8 @@ def populate_sheet_upstream_fields( ) def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUnit]: - workbook_container_key = self.gen_workbook_key(workbook[tableau_constant.ID]) - creator = workbook.get(tableau_constant.OWNER, {}).get( - tableau_constant.USERNAME - ) + workbook_container_key = self.gen_workbook_key(workbook[c.ID]) + creator = workbook.get(c.OWNER, {}).get(c.USERNAME) owner_urn = ( builder.make_user_urn(creator) @@ -2191,17 +2128,17 @@ def emit_workbook_as_container(self, workbook: Dict) -> Iterable[MetadataWorkUni if project_luid and project_luid in self.tableau_project_registry.keys(): parent_key = self.gen_project_key(project_luid) else: - workbook_id: Optional[str] = workbook.get(tableau_constant.ID) - workbook_name: Optional[str] = workbook.get(tableau_constant.NAME) + workbook_id: Optional[str] = workbook.get(c.ID) + workbook_name: Optional[str] = workbook.get(c.NAME) logger.warning( f"Could not load project hierarchy for workbook {workbook_name}({workbook_id}). Please check permissions." ) yield from gen_containers( container_key=workbook_container_key, - name=workbook.get(tableau_constant.NAME) or "", + name=workbook.get(c.NAME) or "", parent_container_key=parent_key, - description=workbook.get(tableau_constant.DESCRIPTION), + description=workbook.get(c.DESCRIPTION), sub_types=[BIContainerSubTypes.TABLEAU_WORKBOOK], owner_urn=owner_urn, external_url=workbook_external_url, @@ -2237,20 +2174,20 @@ def _create_datahub_dashboard_usage_stat( def _get_dashboard_stat_wu( self, dashboard: dict, dashboard_urn: str ) -> Optional[MetadataWorkUnit]: - luid: Optional[str] = dashboard.get(tableau_constant.LUID) + luid: Optional[str] = dashboard.get(c.LUID) if luid is None: logger.debug( "stat:luid is none for dashboard %s(id:%s)", - dashboard.get(tableau_constant.NAME), - dashboard.get(tableau_constant.ID), + dashboard.get(c.NAME), + dashboard.get(c.ID), ) return None usage_stat: Optional[UsageStat] = self.tableau_stat_registry.get(luid) if usage_stat is None: logger.debug( "stat:UsageStat is not available in tableau_stat_registry for dashboard %s(id:%s)", - dashboard.get(tableau_constant.NAME), - dashboard.get(tableau_constant.ID), + dashboard.get(c.NAME), + dashboard.get(c.ID), ) return None @@ -2259,8 +2196,8 @@ def _get_dashboard_stat_wu( ) logger.debug( "stat: Dashboard usage stat is created for %s(id:%s)", - dashboard.get(tableau_constant.NAME), - dashboard.get(tableau_constant.ID), + dashboard.get(c.NAME), + dashboard.get(c.ID), ) return MetadataChangeProposalWrapper( @@ -2288,26 +2225,20 @@ def new_work_unit(self, mcp: MetadataChangeProposalWrapper) -> MetadataWorkUnit: ) def emit_dashboards(self) -> Iterable[MetadataWorkUnit]: - dashboards_filter = ( - f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.dashboard_ids)}" - ) + dashboards_filter = f"{c.ID_WITH_IN}: {json.dumps(self.dashboard_ids)}" for dashboard in self.get_connection_objects( dashboard_graphql_query, - tableau_constant.DASHBOARDS_CONNECTION, + c.DASHBOARDS_CONNECTION, dashboards_filter, ): - yield from self.emit_dashboard( - dashboard, dashboard.get(tableau_constant.WORKBOOK) - ) + yield from self.emit_dashboard(dashboard, dashboard.get(c.WORKBOOK)) def get_tags(self, obj: dict) -> Optional[List[str]]: - tag_list = obj.get(tableau_constant.TAGS, []) + tag_list = obj.get(c.TAGS, []) if tag_list and self.config.ingest_tags: tag_list_str = [ - t[tableau_constant.NAME] - for t in tag_list - if t is not None and t.get(tableau_constant.NAME) + t[c.NAME] for t in tag_list if t is not None and t.get(c.NAME) ] return tag_list_str @@ -2317,7 +2248,7 @@ def emit_dashboard( self, dashboard: dict, workbook: Optional[Dict] ) -> Iterable[MetadataWorkUnit]: dashboard_urn: str = builder.make_dashboard_urn( - self.platform, dashboard[tableau_constant.ID], self.config.platform_instance + self.platform, dashboard[c.ID], self.config.platform_instance ) dashboard_snapshot = DashboardSnapshot( urn=dashboard_urn, @@ -2325,26 +2256,28 @@ def emit_dashboard( ) creator: Optional[str] = None - if workbook is not None and workbook.get(tableau_constant.OWNER) is not None: - creator = workbook[tableau_constant.OWNER].get(tableau_constant.USERNAME) - created_at = dashboard.get(tableau_constant.CREATED_AT, datetime.now()) - updated_at = dashboard.get(tableau_constant.UPDATED_AT, datetime.now()) + if workbook is not None and workbook.get(c.OWNER) is not None: + creator = workbook[c.OWNER].get(c.USERNAME) + created_at = dashboard.get(c.CREATED_AT, datetime.now()) + updated_at = dashboard.get(c.UPDATED_AT, datetime.now()) last_modified = self.get_last_modified(creator, created_at, updated_at) site_part = f"/site/{self.config.site}" if self.config.site else "" - dashboard_external_url = f"{self.config.connect_uri}/#{site_part}/views/{dashboard.get(tableau_constant.PATH, '')}" + dashboard_external_url = ( + f"{self.config.connect_uri}/#{site_part}/views/{dashboard.get(c.PATH, '')}" + ) title = ( - dashboard[tableau_constant.NAME].replace("/", REPLACE_SLASH_CHAR) - if dashboard.get(tableau_constant.NAME) + dashboard[c.NAME].replace("/", REPLACE_SLASH_CHAR) + if dashboard.get(c.NAME) else "" ) chart_urns = [ builder.make_chart_urn( self.platform, - sheet.get(tableau_constant.ID), + sheet.get(c.ID), self.config.platform_instance, ) - for sheet in dashboard.get(tableau_constant.SHEETS, []) + for sheet in dashboard.get(c.SHEETS, []) ] dashboard_info_class = DashboardInfoClass( description="", @@ -2354,9 +2287,7 @@ def emit_dashboard( dashboardUrl=dashboard_external_url if self.config.ingest_external_links_for_dashboards else None, - customProperties=self.get_custom_props_from_dict( - dashboard, [tableau_constant.LUID] - ), + customProperties=self.get_custom_props_from_dict(dashboard, [c.LUID]), ) dashboard_snapshot.aspects.append(dashboard_info_class) @@ -2377,7 +2308,7 @@ def emit_dashboard( dashboard_snapshot.aspects.append(browse_paths) else: logger.warning( - f"Could not set browse path for dashboard {dashboard[tableau_constant.ID]}. Please check permissions." + f"Could not set browse path for dashboard {dashboard[c.ID]}. Please check permissions." ) # Ownership @@ -2397,8 +2328,8 @@ def emit_dashboard( if workbook is not None: yield from add_entity_to_container( - self.gen_workbook_key(workbook[tableau_constant.ID]), - tableau_constant.DASHBOARD, + self.gen_workbook_key(workbook[c.ID]), + c.DASHBOARD, dashboard_snapshot.urn, ) @@ -2406,38 +2337,40 @@ def get_browse_paths_aspect( self, workbook: Optional[Dict] ) -> Optional[BrowsePathsClass]: browse_paths: Optional[BrowsePathsClass] = None - if workbook and workbook.get(tableau_constant.NAME): + if workbook and workbook.get(c.NAME): project_luid: Optional[str] = self._get_workbook_project_luid(workbook) if project_luid in self.tableau_project_registry: browse_paths = BrowsePathsClass( paths=[ f"/{self.platform}/{self._project_luid_to_browse_path_name(project_luid)}" - f"/{workbook[tableau_constant.NAME].replace('/', REPLACE_SLASH_CHAR)}" + f"/{workbook[c.NAME].replace('/', REPLACE_SLASH_CHAR)}" ] ) - elif workbook.get(tableau_constant.PROJECT_NAME): + elif workbook.get(c.PROJECT_NAME): # browse path browse_paths = BrowsePathsClass( paths=[ - f"/{self.platform}/{workbook[tableau_constant.PROJECT_NAME].replace('/', REPLACE_SLASH_CHAR)}" - f"/{workbook[tableau_constant.NAME].replace('/', REPLACE_SLASH_CHAR)}" + f"/{self.platform}/{workbook[c.PROJECT_NAME].replace('/', REPLACE_SLASH_CHAR)}" + f"/{workbook[c.NAME].replace('/', REPLACE_SLASH_CHAR)}" ] ) return browse_paths def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]: - datasource_filter = f"{tableau_constant.ID_WITH_IN}: {json.dumps(self.embedded_datasource_ids_being_used)}" + datasource_filter = ( + f"{c.ID_WITH_IN}: {json.dumps(self.embedded_datasource_ids_being_used)}" + ) for datasource in self.get_connection_objects( embedded_datasource_graphql_query, - tableau_constant.EMBEDDED_DATA_SOURCES_CONNECTION, + c.EMBEDDED_DATA_SOURCES_CONNECTION, datasource_filter, ): yield from self.emit_datasource( datasource, - datasource.get(tableau_constant.WORKBOOK), + datasource.get(c.WORKBOOK), is_embedded_ds=True, ) @@ -2483,7 +2416,7 @@ def emit_project_containers(self) -> Iterable[MetadataWorkUnit]: container_key=self.gen_project_key(_id), name=project.name, description=project.description, - sub_types=[tableau_constant.PROJECT], + sub_types=[c.PROJECT], parent_container_key=self.gen_project_key(project.parent_id) if project.parent_id else None, @@ -2498,7 +2431,7 @@ def emit_project_containers(self) -> Iterable[MetadataWorkUnit]: yield from gen_containers( container_key=self.gen_project_key(project.parent_id), name=cast(str, project.parent_name), - sub_types=[tableau_constant.PROJECT], + sub_types=[c.PROJECT], ) def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py index 7c4852042ce7c8..65d779b7f4516d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py @@ -8,7 +8,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel -from datahub.ingestion.source import tableau_constant as tc +from datahub.ingestion.source import tableau_constant as c from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageType, FineGrainedLineage, @@ -591,12 +591,12 @@ def create( cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None ) -> "TableauUpstreamReference": # Values directly from `table` object from Tableau - database = t_database = d.get(tc.DATABASE, {}).get(tc.NAME) - schema = t_schema = d.get(tc.SCHEMA) - table = t_table = d.get(tc.NAME) or "" - t_full_name = d.get(tc.FULL_NAME) - t_connection_type = d[tc.CONNECTION_TYPE] # required to generate urn - t_id = d[tc.ID] + database = t_database = d.get(c.DATABASE, {}).get(c.NAME) + schema = t_schema = d.get(c.SCHEMA) + table = t_table = d.get(c.NAME) or "" + t_full_name = d.get(c.FULL_NAME) + t_connection_type = d[c.CONNECTION_TYPE] # required to generate urn + t_id = d[c.ID] parsed_full_name = cls.parse_full_name(t_full_name) if parsed_full_name and len(parsed_full_name) == 3: diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index c3e8c175f1de54..9fc697018ecd6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -12,7 +12,7 @@ OAUTH_AUTHENTICATOR, ) -from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.common import AllowDenyPattern, ConfigModel from datahub.configuration.oauth import OAuthConfiguration, OAuthIdentityProvider from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field @@ -42,9 +42,14 @@ SNOWFLAKE_HOST_SUFFIX = ".snowflakecomputing.com" -class BaseSnowflakeConfig(BaseTimeWindowConfig): +class BaseSnowflakeConfig(ConfigModel): # Note: this config model is also used by the snowflake-usage source. + options: dict = pydantic.Field( + default_factory=dict, + description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", + ) + scheme: str = "snowflake" username: Optional[str] = pydantic.Field( default=None, description="Snowflake username." @@ -82,14 +87,6 @@ class BaseSnowflakeConfig(BaseTimeWindowConfig): default=None, description="Snowflake warehouse." ) role: Optional[str] = pydantic.Field(default=None, description="Snowflake role.") - include_table_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", - ) - include_view_lineage: bool = pydantic.Field( - default=True, - description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", - ) connect_args: Optional[Dict[str, Any]] = pydantic.Field( default=None, description="Connect args to pass to Snowflake SqlAlchemy driver", @@ -166,18 +163,6 @@ def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None: "but should be set when using use_certificate false for oauth_config" ) - @pydantic.root_validator() - def validate_include_view_lineage(cls, values): - if ( - "include_table_lineage" in values - and not values.get("include_table_lineage") - and values.get("include_view_lineage") - ): - raise ValueError( - "include_table_lineage must be True for include_view_lineage to be set." - ) - return values - def get_sql_alchemy_url( self, database: Optional[str] = None, @@ -261,28 +246,8 @@ def get_connect_args(self) -> dict: self._computed_connect_args = connect_args return connect_args - -class SnowflakeConfig(BaseSnowflakeConfig, SQLCommonConfig): - database_pattern: AllowDenyPattern = AllowDenyPattern( - deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] - ) - - ignore_start_time_lineage: bool = False - upstream_lineage_in_report: bool = False - - def get_sql_alchemy_url( - self, - database: Optional[str] = None, - username: Optional[str] = None, - password: Optional[pydantic.SecretStr] = None, - role: Optional[str] = None, - ) -> str: - return super().get_sql_alchemy_url( - database=database, username=username, password=password, role=role - ) - def get_options(self) -> dict: - options_connect_args: Dict = super().get_connect_args() + options_connect_args: Dict = self.get_connect_args() options_connect_args.update(self.options.get("connect_args", {})) self.options["connect_args"] = options_connect_args return self.options @@ -372,3 +337,34 @@ def get_connection(self) -> snowflake.connector.SnowflakeConnection: else: # not expected to be here raise Exception("Not expected to be here.") + + +class SnowflakeConfig(BaseSnowflakeConfig, BaseTimeWindowConfig, SQLCommonConfig): + + include_table_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role and Snowflake Enterprise Edition or above.", + ) + include_view_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake view->table and table->view lineages. Requires appropriate grants given to the role, and include_table_lineage to be True. view->table lineage requires Snowflake Enterprise Edition or above.", + ) + + database_pattern: AllowDenyPattern = AllowDenyPattern( + deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] + ) + + ignore_start_time_lineage: bool = False + upstream_lineage_in_report: bool = False + + @pydantic.root_validator() + def validate_include_view_lineage(cls, values): + if ( + "include_table_lineage" in values + and not values.get("include_table_lineage") + and values.get("include_view_lineage") + ): + raise ValueError( + "include_table_lineage must be True for include_view_lineage to be set." + ) + return values diff --git a/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py b/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py deleted file mode 100644 index 85ad0fb4938eb8..00000000000000 --- a/metadata-ingestion/src/datahub/utilities/memory_leak_detector.py +++ /dev/null @@ -1,106 +0,0 @@ -import fnmatch -import gc -import logging -import sys -import tracemalloc -from collections import defaultdict -from functools import wraps -from typing import Any, Callable, Dict, List, TypeVar, Union, cast - -import click -from typing_extensions import Concatenate, ParamSpec - -logger = logging.getLogger(__name__) -T = TypeVar("T") -P = ParamSpec("P") - - -def _trace_has_file(trace: tracemalloc.Traceback, file_pattern: str) -> bool: - for frame_index in range(len(trace)): - cur_frame = trace[frame_index] - if fnmatch.fnmatch(cur_frame.filename, file_pattern): - return True - return False - - -def _init_leak_detection() -> None: - # Initialize trace malloc to track up to 25 stack frames. - tracemalloc.start(25) - if sys.version_info >= (3, 9): - # Nice to reset peak to 0. Available for versions >= 3.9. - tracemalloc.reset_peak() - # Enable leak debugging in the garbage collector. - gc.set_debug(gc.DEBUG_LEAK) - - -def _perform_leak_detection() -> None: - # Log potentially useful memory usage metrics - logger.info(f"GC count before collect {gc.get_count()}") - traced_memory_size, traced_memory_peak = tracemalloc.get_traced_memory() - logger.info(f"Traced Memory: size={traced_memory_size}, peak={traced_memory_peak}") - num_unreacheable_objects = gc.collect() - logger.info(f"Number of unreachable objects = {num_unreacheable_objects}") - logger.info(f"GC count after collect {gc.get_count()}") - - # Collect unique traces of all live objects in the garbage - these have potential leaks. - unique_traces_to_objects: Dict[ - Union[tracemalloc.Traceback, int], List[object] - ] = defaultdict(list) - for obj in gc.garbage: - obj_trace = tracemalloc.get_object_traceback(obj) - if obj_trace is not None: - if _trace_has_file(obj_trace, "*datahub/*.py"): - # Leaking object - unique_traces_to_objects[obj_trace].append(obj) - else: - unique_traces_to_objects[id(obj)].append(obj) - logger.info("Potentially leaking objects start") - for key, obj_list in sorted( - unique_traces_to_objects.items(), - key=lambda item: sum( - [sys.getsizeof(o) for o in item[1]] - ), # TODO: add support for deep sizeof - reverse=True, - ): - if isinstance(key, tracemalloc.Traceback): - obj_traceback: tracemalloc.Traceback = cast(tracemalloc.Traceback, key) - logger.info( - f"#Objects:{len(obj_list)}; Total memory:{sum([sys.getsizeof(obj) for obj in obj_list])};" - + " Allocation Trace:\n\t" - + "\n\t".join(obj_traceback.format(limit=25)) - ) - else: - logger.info( - f"#Objects:{len(obj_list)}; Total memory:{sum([sys.getsizeof(obj) for obj in obj_list])};" - + " No Allocation Trace available!" - ) - logger.info("Potentially leaking objects end") - - tracemalloc.stop() - - -def with_leak_detection( - func: Callable[Concatenate[click.Context, P], T] -) -> Callable[Concatenate[click.Context, P], T]: - @wraps(func) - def wrapper(ctx: click.Context, *args: P.args, **kwargs: P.kwargs) -> Any: - detect_leaks: bool = ctx.obj.get("detect_memory_leaks", False) - if detect_leaks: - logger.info( - f"Initializing memory leak detection on command: {func.__module__}.{func.__name__}" - ) - _init_leak_detection() - - try: - return func(ctx, *args, **kwargs) - finally: - if detect_leaks: - logger.info( - f"Starting memory leak detection on command: {func.__module__}.{func.__name__}" - ) - _perform_leak_detection() - logger.info( - f"Finished memory leak detection on command: {func.__module__}.{func.__name__}" - ) - - return wrapper diff --git a/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json new file mode 100644 index 00000000000000..a72c960a722969 --- /dev/null +++ b/metadata-ingestion/tests/integration/fivetran/fivetran_golden.json @@ -0,0 +1,658 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": {}, + "name": "postgres" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "paused": "False", + "sync_frequency": "1440", + "destination_id": "'interval_unconstitutional'" + }, + "name": "postgres", + "type": { + "string": "COMMAND" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD),name)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:Shubham Jagtap", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": {}, + "name": "4c9a03d6-eded-4422-a46a-163266e58243", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1695191853000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1695191853000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:ee88d32dbe3133a23a9023c097050190", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1695191885000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": {}, + "name": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1696343730000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343730000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:be36f55c13ec4e313c7510770e50784a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343732000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SKIPPED", + "nativeResultType": "fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": {}, + "name": "63c2fc85-600b-455f-9ba0-f576522465be", + "type": "BATCH_SCHEDULED", + "created": { + "time": 1696343755000, + "actor": "urn:li:corpuser:datahub" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "upstreamInstances": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.employee,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:postgres,postgres_db.public.company,DEV)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.employee,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_database.postgres_public.company,PROD)" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343755000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:d8f100271d2dc3fa905717f82d083c8d", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696343790000, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "FAILURE", + "nativeResultType": "fivetran" + } + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(fivetran,calendar_elected,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(fivetran,calendar_elected,PROD),calendar_elected)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "powerbi-test", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/fivetran/test_fivetran.py b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py new file mode 100644 index 00000000000000..62b3df12e1b9d3 --- /dev/null +++ b/metadata-ingestion/tests/integration/fivetran/test_fivetran.py @@ -0,0 +1,192 @@ +import datetime +from unittest import mock +from unittest.mock import MagicMock + +import pytest +from freezegun import freeze_time + +from datahub.ingestion.run.pipeline import Pipeline +from datahub.ingestion.source.fivetran.config import DestinationConfig +from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery +from tests.test_helpers import mce_helpers + +FROZEN_TIME = "2022-06-07 17:00:00" + + +def default_query_results(query): + if query == FivetranLogQuery.use_schema("TEST_DATABASE", "TEST_SCHEMA"): + return [] + elif query == FivetranLogQuery.get_connectors_query(): + return [ + { + "connector_id": "calendar_elected", + "connecting_user_id": "reapply_phone", + "connector_type_id": "postgres", + "connector_name": "postgres", + "paused": False, + "sync_frequency": 1440, + "destination_id": "interval_unconstitutional", + }, + ] + elif query == FivetranLogQuery.get_table_lineage_query("calendar_elected"): + return [ + { + "source_table_id": "10040", + "source_table_name": "employee", + "source_schema_name": "public", + "destination_table_id": "7779", + "destination_table_name": "employee", + "destination_schema_name": "postgres_public", + }, + { + "source_table_id": "10041", + "source_table_name": "company", + "source_schema_name": "public", + "destination_table_id": "7780", + "destination_table_name": "company", + "destination_schema_name": "postgres_public", + }, + ] + elif query == FivetranLogQuery.get_column_lineage_query( + "10040", "7779" + ) or query == FivetranLogQuery.get_column_lineage_query("10041", "7780"): + return [ + { + "source_column_name": "id", + "destination_column_name": "id", + }, + { + "source_column_name": "name", + "destination_column_name": "name", + }, + ] + elif query == FivetranLogQuery.get_user_query("reapply_phone"): + return [ + { + "user_id": "reapply_phone", + "given_name": "Shubham", + "family_name": "Jagtap", + } + ] + elif query == FivetranLogQuery.get_sync_start_logs_query("calendar_elected"): + return [ + { + "time_stamp": datetime.datetime(2023, 9, 20, 6, 37, 32, 606000), + "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", + }, + { + "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 30, 345000), + "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", + }, + { + "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 55, 401000), + "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", + }, + ] + elif query == FivetranLogQuery.get_sync_end_logs_query("calendar_elected"): + return [ + { + "time_stamp": datetime.datetime(2023, 9, 20, 6, 38, 5, 56000), + "sync_id": "4c9a03d6-eded-4422-a46a-163266e58243", + "message_data": '"{\\"status\\":\\"SUCCESSFUL\\"}"', + }, + { + "time_stamp": datetime.datetime(2023, 10, 3, 14, 35, 31, 512000), + "sync_id": "f773d1e9-c791-48f4-894f-8cf9b3dfc834", + "message_data": '"{\\"reason\\":\\"Sync has been cancelled because of a user action in the dashboard.Standard Config updated.\\",\\"status\\":\\"CANCELED\\"}"', + }, + { + "time_stamp": datetime.datetime(2023, 10, 3, 14, 36, 29, 678000), + "sync_id": "63c2fc85-600b-455f-9ba0-f576522465be", + "message_data": '"{\\"reason\\":\\"java.lang.RuntimeException: FATAL: too many connections for role \\\\\\"hxwraqld\\\\\\"\\",\\"taskType\\":\\"reconnect\\",\\"status\\":\\"FAILURE_WITH_TASK\\"}"', + }, + ] + # Unreachable code + raise Exception(f"Unknown query {query}") + + +@freeze_time(FROZEN_TIME) +@pytest.mark.integration +def test_fivetran_basic(pytestconfig, tmp_path): + test_resources_dir = pytestconfig.rootpath / "tests/integration/fivetran" + + # Run the metadata ingestion pipeline. + output_file = tmp_path / "fivetran_test_events.json" + golden_file = test_resources_dir / "fivetran_golden.json" + + with mock.patch( + "datahub.ingestion.source.fivetran.fivetran_log_api.create_engine" + ) as mock_create_engine: + connection_magic_mock = MagicMock() + connection_magic_mock.execute.side_effect = default_query_results + + mock_create_engine.return_value = connection_magic_mock + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "fivetran", + "config": { + "fivetran_log_config": { + "destination_platform": "snowflake", + "destination_config": { + "account_id": "TESTID", + "warehouse": "TEST_WH", + "username": "test", + "password": "test@123", + "database": "TEST_DATABASE", + "role": "TESTROLE", + "log_schema": "TEST_SCHEMA", + }, + }, + "connector_patterns": { + "allow": [ + "postgres", + ] + }, + "sources_to_database": { + "calendar_elected": "postgres_db", + }, + "sources_to_platform_instance": { + "calendar_elected": { + "env": "DEV", + } + }, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{output_file}", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "fivetran_golden.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=f"{output_file}", + golden_path=f"{test_resources_dir}/{golden_file}", + ) + + +@freeze_time(FROZEN_TIME) +def test_fivetran_snowflake_destination_config(pytestconfig, tmp_path): + snowflake_dest = DestinationConfig( + account_id="TESTID", + warehouse="TEST_WH", + username="test", + password="test@123", + database="TEST_DATABASE", + role="TESTROLE", + log_schema="TEST_SCHEMA", + ) + assert ( + snowflake_dest.get_sql_alchemy_url() + == "snowflake://test:test%40123@TESTID?application=acryl_datahub&authenticator=SNOWFLAKE&role=TESTROLE&warehouse=TEST_WH" + ) diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index ff448eca01071f..78e54996973119 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -565,5 +565,4 @@ def default_query_results( # noqa: C901 "DOMAIN": "DATABASE", }, ] - # Unreachable code - raise Exception(f"Unknown query {query}") + raise ValueError(f"Unexpected query: {query}") diff --git a/metadata-ingestion/tests/unit/test_datahub_source.py b/metadata-ingestion/tests/unit/test_datahub_source.py new file mode 100644 index 00000000000000..adc131362b326b --- /dev/null +++ b/metadata-ingestion/tests/unit/test_datahub_source.py @@ -0,0 +1,51 @@ +from dataclasses import dataclass + +import pytest + +from datahub.ingestion.source.datahub.datahub_database_reader import ( + VersionOrderable, + VersionOrderer, +) + + +@dataclass +class MockRow(VersionOrderable): + createdon: int + version: int + urn: str + + +@pytest.fixture +def rows(): + return [ + MockRow(0, 0, "one"), + MockRow(0, 1, "one"), + MockRow(0, 0, "two"), + MockRow(0, 0, "three"), + MockRow(0, 1, "three"), + MockRow(0, 2, "three"), + MockRow(0, 1, "two"), + MockRow(0, 4, "three"), + MockRow(0, 5, "three"), + MockRow(1, 6, "three"), + MockRow(1, 0, "four"), + MockRow(2, 0, "five"), + MockRow(2, 1, "six"), + MockRow(2, 0, "six"), + MockRow(3, 0, "seven"), + MockRow(3, 0, "eight"), + ] + + +def test_version_orderer(rows): + orderer = VersionOrderer[MockRow](enabled=True) + ordered_rows = list(orderer(rows)) + assert ordered_rows == sorted( + ordered_rows, key=lambda x: (x.createdon, x.version == 0) + ) + + +def test_version_orderer_disabled(rows): + orderer = VersionOrderer[MockRow](enabled=False) + ordered_rows = list(orderer(rows)) + assert ordered_rows == rows diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index 888a7c04415542..aaff878b81eeef 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ -368,8 +368,7 @@ def default_query_results(query): return [('{"roles":"","value":""}',)] elif query == "select current_warehouse()": return [("TEST_WAREHOUSE")] - # Unreachable code - raise Exception() + raise ValueError(f"Unexpected query: {query}") connection_mock = MagicMock() cursor_mock = MagicMock() @@ -397,8 +396,7 @@ def query_results(query): ] elif query == 'show grants to role "PUBLIC"': return [] - # Unreachable code - raise Exception() + raise ValueError(f"Unexpected query: {query}") config = { "username": "user", @@ -441,8 +439,7 @@ def query_results(query): return [("", "USAGE", "DATABASE", "DB1")] elif query == 'show grants to role "PUBLIC"': return [] - # Unreachable code - raise Exception() + raise ValueError(f"Unexpected query: {query}") setup_mock_connect(mock_connect, query_results) @@ -485,8 +482,7 @@ def query_results(query): ] elif query == 'show grants to role "PUBLIC"': return [] - # Unreachable code - raise Exception() + raise ValueError(f"Unexpected query: {query}") setup_mock_connect(mock_connect, query_results) @@ -536,8 +532,7 @@ def query_results(query): ["", "USAGE", "VIEW", "SNOWFLAKE.ACCOUNT_USAGE.ACCESS_HISTORY"], ["", "USAGE", "VIEW", "SNOWFLAKE.ACCOUNT_USAGE.OBJECT_DEPENDENCIES"], ] - # Unreachable code - raise Exception() + raise ValueError(f"Unexpected query: {query}") setup_mock_connect(mock_connect, query_results) diff --git a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java index a69c6008fea474..dff9a22de8efd8 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/client/JavaEntityClient.java @@ -381,7 +381,7 @@ public SearchResult searchAcrossEntities( @Nonnull @Override public ScrollResult scrollAcrossEntities(@Nonnull List entities, @Nonnull String input, - @Nullable Filter filter, @Nullable String scrollId, @Nonnull String keepAlive, int count, + @Nullable Filter filter, @Nullable String scrollId, @Nullable String keepAlive, int count, @Nullable SearchFlags searchFlags, @Nonnull Authentication authentication) throws RemoteInvocationException { final SearchFlags finalFlags = searchFlags != null ? searchFlags : new SearchFlags().setFulltext(true); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java index 94b8d57efcc160..c99e4a94feb291 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/SearchService.java @@ -147,15 +147,23 @@ public SearchResult searchAcrossEntities(@Nonnull List entities, @Nonnul return result; } + /** + * If no entities are provided, fallback to the list of non-empty entities + * @param inputEntities the requested entities + * @return some entities to search + */ private List getEntitiesToSearch(@Nonnull List inputEntities) { List nonEmptyEntities; List lowercaseEntities = inputEntities.stream().map(String::toLowerCase).collect(Collectors.toList()); - try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getNonEmptyEntities").time()) { - nonEmptyEntities = _entityDocCountCache.getNonEmptyEntities(); - } - if (!inputEntities.isEmpty()) { - nonEmptyEntities = nonEmptyEntities.stream().filter(lowercaseEntities::contains).collect(Collectors.toList()); + + if (lowercaseEntities.isEmpty()) { + try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "getNonEmptyEntities").time()) { + nonEmptyEntities = _entityDocCountCache.getNonEmptyEntities(); + } + } else { + nonEmptyEntities = lowercaseEntities; } + return nonEmptyEntities; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java index ceaf37a1289d99..db414d70603dc7 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java @@ -16,7 +16,7 @@ import javax.annotation.Nonnull; import javax.annotation.Nullable; import lombok.RequiredArgsConstructor; -import org.javatuples.Quintet; +import org.javatuples.Septet; import org.javatuples.Sextet; import org.springframework.cache.Cache; import org.springframework.cache.CacheManager; @@ -154,8 +154,9 @@ public SearchResult getCachedSearchResults( batchSize, querySize -> getRawSearchResults(entityNames, query, filters, sortCriterion, querySize.getFrom(), querySize.getSize(), flags, facets), - querySize -> Sextet.with(entityNames, query, filters != null ? toJsonString(filters) : null, - sortCriterion != null ? toJsonString(sortCriterion) : null, facets, querySize), flags, enableCache).getSearchResults(from, size); + querySize -> Septet.with(entityNames, query, filters != null ? toJsonString(filters) : null, + sortCriterion != null ? toJsonString(sortCriterion) : null, flags != null ? toJsonString(flags) : null, + facets, querySize), flags, enableCache).getSearchResults(from, size); } @@ -175,7 +176,8 @@ public AutoCompleteResult getCachedAutoCompleteResults( if (enableCache(flags)) { try (Timer.Context ignored2 = MetricUtils.timer(this.getClass(), "getCachedAutoCompleteResults_cache").time()) { Timer.Context cacheAccess = MetricUtils.timer(this.getClass(), "autocomplete_cache_access").time(); - Object cacheKey = Quintet.with(entityName, input, field, filters != null ? toJsonString(filters) : null, limit); + Object cacheKey = Sextet.with(entityName, input, field, filters != null ? toJsonString(filters) : null, + flags != null ? toJsonString(flags) : null, limit); String json = cache.get(cacheKey, String.class); result = json != null ? toRecordTemplate(AutoCompleteResult.class, json) : null; cacheAccess.stop(); @@ -210,7 +212,8 @@ public BrowseResult getCachedBrowseResults( if (enableCache(flags)) { try (Timer.Context ignored2 = MetricUtils.timer(this.getClass(), "getCachedBrowseResults_cache").time()) { Timer.Context cacheAccess = MetricUtils.timer(this.getClass(), "browse_cache_access").time(); - Object cacheKey = Quintet.with(entityName, path, filters != null ? toJsonString(filters) : null, from, size); + Object cacheKey = Sextet.with(entityName, path, filters != null ? toJsonString(filters) : null, + flags != null ? toJsonString(flags) : null, from, size); String json = cache.get(cacheKey, String.class); result = json != null ? toRecordTemplate(BrowseResult.class, json) : null; cacheAccess.stop(); @@ -247,9 +250,10 @@ public ScrollResult getCachedScrollResults( ScrollResult result; if (enableCache(flags)) { Timer.Context cacheAccess = MetricUtils.timer(this.getClass(), "scroll_cache_access").time(); - Object cacheKey = Sextet.with(entities, query, + Object cacheKey = Septet.with(entities, query, filters != null ? toJsonString(filters) : null, sortCriterion != null ? toJsonString(sortCriterion) : null, + flags != null ? toJsonString(flags) : null, scrollId, size); String json = cache.get(cacheKey, String.class); result = json != null ? toRecordTemplate(ScrollResult.class, json) : null; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java index cbaf70ca22617d..290e8c60deb000 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/ESSearchDAO.java @@ -157,7 +157,7 @@ private AggregationMetadataArray transformIndexIntoEntityName(AggregationMetadat @Nonnull @WithSpan private ScrollResult executeAndExtract(@Nonnull List entitySpecs, @Nonnull SearchRequest searchRequest, @Nullable Filter filter, - @Nullable String scrollId, @Nonnull String keepAlive, int size) { + @Nullable String scrollId, @Nullable String keepAlive, int size) { try (Timer.Context ignored = MetricUtils.timer(this.getClass(), "executeAndExtract_scroll").time()) { final SearchResponse searchResponse = client.search(searchRequest, RequestOptions.DEFAULT); // extract results, validated against document model as well @@ -166,7 +166,7 @@ private ScrollResult executeAndExtract(@Nonnull List entitySpecs, @N .extractScrollResult(searchResponse, filter, scrollId, keepAlive, size, supportsPointInTime())); } catch (Exception e) { - log.error("Search query failed", e); + log.error("Search query failed: {}", searchRequest, e); throw new ESQueryException("Search query failed:", e); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index 49571a60d5f211..0df6afd49c3735 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -241,7 +241,9 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi BoolQueryBuilder filterQuery = getFilterQuery(filter); searchSourceBuilder.query(QueryBuilders.boolQuery().must(getQuery(input, finalSearchFlags.isFulltext())).filter(filterQuery)); - _aggregationQueryBuilder.getAggregations().forEach(searchSourceBuilder::aggregation); + if (!finalSearchFlags.isSkipAggregates()) { + _aggregationQueryBuilder.getAggregations().forEach(searchSourceBuilder::aggregation); + } if (!finalSearchFlags.isSkipHighlighting()) { searchSourceBuilder.highlighter(_highlights); } @@ -366,7 +368,7 @@ public SearchResult extractResult(@Nonnull SearchResponse searchResponse, Filter @WithSpan public ScrollResult extractScrollResult(@Nonnull SearchResponse searchResponse, Filter filter, @Nullable String scrollId, - @Nonnull String keepAlive, int size, boolean supportsPointInTime) { + @Nullable String keepAlive, int size, boolean supportsPointInTime) { int totalCount = (int) searchResponse.getHits().getTotalHits().value; List resultList = getResults(searchResponse); SearchResultMetadata searchResultMetadata = extractSearchResultMetadata(searchResponse, filter); @@ -376,7 +378,7 @@ public ScrollResult extractScrollResult(@Nonnull SearchResponse searchResponse, if (searchHits.length == size) { Object[] sort = searchHits[searchHits.length - 1].getSortValues(); long expirationTimeMs = 0L; - if (supportsPointInTime) { + if (keepAlive != null && supportsPointInTime) { expirationTimeMs = TimeValue.parseTimeValue(keepAlive, "expirationTime").getMillis() + System.currentTimeMillis(); } nextScrollId = new SearchAfterWrapper(sort, searchResponse.pointInTimeId(), expirationTimeMs).toScrollId(); diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl index ded84e1969153b..935f3e5976dfa5 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl @@ -9,24 +9,7 @@ record AssertionResult { */ @TimeseriesField = {} @Searchable = {} - type: enum AssertionResultType { - /** - * The Assertion has not yet been fully evaluated - */ - INIT - /** - * The Assertion Succeeded - */ - SUCCESS - /** - * The Assertion Failed - */ - FAILURE - /** - * The Assertion encountered an Error - */ - ERROR - } + type: AssertionResultType /** * Number of rows for evaluated batch diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl new file mode 100644 index 00000000000000..8954d94cced7bf --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultType.pdl @@ -0,0 +1,23 @@ +namespace com.linkedin.assertion + +/** +* The final result of evaluating an assertion, e.g. SUCCESS, FAILURE, or ERROR. +*/ +enum AssertionResultType { + /** + * The Assertion has not yet been fully evaluated + */ + INIT + /** + * The Assertion Succeeded + */ + SUCCESS + /** + * The Assertion Failed + */ + FAILURE + /** + * The Assertion encountered an Error + */ + ERROR +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl index 14f12042327404..55bcae77273dbd 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl @@ -39,12 +39,7 @@ record AssertionRunEvent { * The status of the assertion run as per this timeseries event. */ @TimeseriesField = {} - status: enum AssertionRunStatus { - /** - * The Assertion Run has completed - */ - COMPLETE - } + status: AssertionRunStatus /** * Results of assertion, present if the status is COMPLETE diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl new file mode 100644 index 00000000000000..e4e17925ede82a --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunStatus.pdl @@ -0,0 +1,12 @@ +namespace com.linkedin.assertion + + +/** +* The lifecycle status of an assertion run. +*/ +enum AssertionRunStatus { + /** + * The Assertion Run has completed + */ + COMPLETE +} \ No newline at end of file diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml index 11d0f74305d7be..a5296d074093be 100644 --- a/metadata-models/src/main/resources/entity-registry.yml +++ b/metadata-models/src/main/resources/entity-registry.yml @@ -120,6 +120,7 @@ entities: - globalTags - glossaryTerms - browsePathsV2 + - subTypes - name: dashboard keyAspect: dashboardKey aspects: diff --git a/metadata-service/auth-filter/src/main/java/com/datahub/auth/authentication/filter/AuthenticationFilter.java b/metadata-service/auth-filter/src/main/java/com/datahub/auth/authentication/filter/AuthenticationFilter.java index e15918a8131580..8c7b3ac8b98f04 100644 --- a/metadata-service/auth-filter/src/main/java/com/datahub/auth/authentication/filter/AuthenticationFilter.java +++ b/metadata-service/auth-filter/src/main/java/com/datahub/auth/authentication/filter/AuthenticationFilter.java @@ -2,6 +2,7 @@ import com.datahub.authentication.authenticator.AuthenticatorChain; import com.datahub.authentication.authenticator.DataHubSystemAuthenticator; +import com.datahub.authentication.authenticator.HealthStatusAuthenticator; import com.datahub.authentication.authenticator.NoOpAuthenticator; import com.datahub.authentication.token.StatefulTokenService; import com.datahub.plugins.PluginConstant; @@ -29,6 +30,7 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -148,7 +150,7 @@ private void buildAuthenticatorChain() { } private AuthenticationRequest buildAuthContext(HttpServletRequest request) { - return new AuthenticationRequest(Collections.list(request.getHeaderNames()) + return new AuthenticationRequest(request.getServletPath(), request.getPathInfo(), Collections.list(request.getHeaderNames()) .stream() .collect(Collectors.toMap(headerName -> headerName, request::getHeader))); } @@ -242,7 +244,14 @@ private void registerNativeAuthenticator(AuthenticatorChain authenticatorChain, final Authenticator authenticator = clazz.newInstance(); // Successfully created authenticator. Now init and register it. log.debug(String.format("Initializing Authenticator with name %s", type)); - authenticator.init(configs, authenticatorContext); + if (authenticator instanceof HealthStatusAuthenticator) { + Map authenticatorConfig = new HashMap<>(Map.of(SYSTEM_CLIENT_ID_CONFIG, + this.configurationProvider.getAuthentication().getSystemClientId())); + authenticatorConfig.putAll(Optional.ofNullable(internalAuthenticatorConfig.getConfigs()).orElse(Collections.emptyMap())); + authenticator.init(authenticatorConfig, authenticatorContext); + } else { + authenticator.init(configs, authenticatorContext); + } log.info(String.format("Registering Authenticator with name %s", type)); authenticatorChain.register(authenticator); } catch (Exception e) { diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authentication/authenticator/HealthStatusAuthenticator.java b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/authenticator/HealthStatusAuthenticator.java new file mode 100644 index 00000000000000..5749eacf5d25d6 --- /dev/null +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authentication/authenticator/HealthStatusAuthenticator.java @@ -0,0 +1,55 @@ +package com.datahub.authentication.authenticator; + +import com.datahub.authentication.Actor; +import com.datahub.authentication.ActorType; +import com.datahub.authentication.Authentication; +import com.datahub.authentication.AuthenticationException; +import com.datahub.authentication.AuthenticationRequest; +import com.datahub.authentication.AuthenticatorContext; +import com.datahub.plugins.auth.authentication.Authenticator; +import lombok.extern.slf4j.Slf4j; + +import javax.annotation.Nonnull; +import javax.annotation.Nullable; +import java.util.Collections; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +import static com.datahub.authentication.AuthenticationConstants.SYSTEM_CLIENT_ID_CONFIG; + + +/** + * This Authenticator is used for allowing access for unauthenticated health check endpoints + * + * It exists to support load balancers, liveness/readiness checks + * + */ +@Slf4j +public class HealthStatusAuthenticator implements Authenticator { + private static final Set HEALTH_ENDPOINTS = Set.of( + "/openapi/check/", + "/openapi/up/" + ); + private String systemClientId; + + @Override + public void init(@Nonnull final Map config, @Nullable final AuthenticatorContext context) { + Objects.requireNonNull(config, "Config parameter cannot be null"); + this.systemClientId = Objects.requireNonNull((String) config.get(SYSTEM_CLIENT_ID_CONFIG), + String.format("Missing required config %s", SYSTEM_CLIENT_ID_CONFIG)); + } + + @Override + public Authentication authenticate(@Nonnull AuthenticationRequest context) throws AuthenticationException { + Objects.requireNonNull(context); + if (HEALTH_ENDPOINTS.stream().anyMatch(prefix -> String.join("", context.getServletInfo(), context.getPathInfo()).startsWith(prefix))) { + return new Authentication( + new Actor(ActorType.USER, systemClientId), + "", + Collections.emptyMap() + ); + } + throw new AuthenticationException("Authorization not allowed. Non-health check endpoint."); + } +} diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java index f8eca541e1efb4..7e7a1de176f06d 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java @@ -126,11 +126,16 @@ private AuthorizedActors mergeAuthorizedActors(@Nullable AuthorizedActors origin mergedGroups = new ArrayList<>(groups); } + Set roles = new HashSet<>(original.getRoles()); + roles.addAll(other.getRoles()); + List mergedRoles = new ArrayList<>(roles); + return AuthorizedActors.builder() .allUsers(original.isAllUsers() || other.isAllUsers()) .allGroups(original.isAllGroups() || other.isAllGroups()) .users(mergedUsers) .groups(mergedGroups) + .roles(mergedRoles) .build(); } diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java index e30fb93109915a..956d635c7901ac 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java @@ -19,6 +19,7 @@ import java.util.concurrent.Executors; import java.util.concurrent.ScheduledExecutorService; import java.util.concurrent.TimeUnit; +import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; import javax.annotation.Nonnull; @@ -55,7 +56,8 @@ public enum AuthorizationMode { // Maps privilege name to the associated set of policies for fast access. // Not concurrent data structure because writes are always against the entire thing. private final Map> _policyCache = new HashMap<>(); // Shared Policy Cache. - private final ReadWriteLock _lockPolicyCache = new ReentrantReadWriteLock(); + private final ReadWriteLock readWriteLock = new ReentrantReadWriteLock(); + private final Lock readLock = readWriteLock.readLock(); private final ScheduledExecutorService _refreshExecutorService = Executors.newScheduledThreadPool(1); private final PolicyRefreshRunnable _policyRefreshRunnable; @@ -70,11 +72,13 @@ public DataHubAuthorizer( final EntityClient entityClient, final int delayIntervalSeconds, final int refreshIntervalSeconds, - final AuthorizationMode mode) { + final AuthorizationMode mode, + final int policyFetchSize) { _systemAuthentication = Objects.requireNonNull(systemAuthentication); _mode = Objects.requireNonNull(mode); _policyEngine = new PolicyEngine(systemAuthentication, Objects.requireNonNull(entityClient)); - _policyRefreshRunnable = new PolicyRefreshRunnable(systemAuthentication, new PolicyFetcher(entityClient), _policyCache, _lockPolicyCache); + _policyRefreshRunnable = new PolicyRefreshRunnable(systemAuthentication, new PolicyFetcher(entityClient), _policyCache, + readWriteLock.writeLock(), policyFetchSize); _refreshExecutorService.scheduleAtFixedRate(_policyRefreshRunnable, delayIntervalSeconds, refreshIntervalSeconds, TimeUnit.SECONDS); } @@ -93,41 +97,30 @@ public AuthorizationResult authorize(@Nonnull final AuthorizationRequest request Optional resolvedResourceSpec = request.getResourceSpec().map(_entitySpecResolver::resolve); - _lockPolicyCache.readLock().lock(); - try { - // 1. Fetch the policies relevant to the requested privilege. - final List policiesToEvaluate = _policyCache.getOrDefault(request.getPrivilege(), new ArrayList<>()); - - // 2. Evaluate each policy. - for (DataHubPolicyInfo policy : policiesToEvaluate) { - if (isRequestGranted(policy, request, resolvedResourceSpec)) { - // Short circuit if policy has granted privileges to this actor. - return new AuthorizationResult(request, AuthorizationResult.Type.ALLOW, - String.format("Granted by policy with type: %s", policy.getType())); - } + // 1. Fetch the policies relevant to the requested privilege. + final List policiesToEvaluate = getOrDefault(request.getPrivilege(), new ArrayList<>()); + + // 2. Evaluate each policy. + for (DataHubPolicyInfo policy : policiesToEvaluate) { + if (isRequestGranted(policy, request, resolvedResourceSpec)) { + // Short circuit if policy has granted privileges to this actor. + return new AuthorizationResult(request, AuthorizationResult.Type.ALLOW, + String.format("Granted by policy with type: %s", policy.getType())); } - return new AuthorizationResult(request, AuthorizationResult.Type.DENY, null); - } finally { - _lockPolicyCache.readLock().unlock(); } + return new AuthorizationResult(request, AuthorizationResult.Type.DENY, null); } public List getGrantedPrivileges(final String actor, final Optional resourceSpec) { + // 1. Fetch all policies + final List policiesToEvaluate = getOrDefault(ALL, new ArrayList<>()); - _lockPolicyCache.readLock().lock(); - try { - // 1. Fetch all policies - final List policiesToEvaluate = _policyCache.getOrDefault(ALL, new ArrayList<>()); - - Urn actorUrn = UrnUtils.getUrn(actor); - final ResolvedEntitySpec resolvedActorSpec = _entitySpecResolver.resolve(new EntitySpec(actorUrn.getEntityType(), actor)); + Urn actorUrn = UrnUtils.getUrn(actor); + final ResolvedEntitySpec resolvedActorSpec = _entitySpecResolver.resolve(new EntitySpec(actorUrn.getEntityType(), actor)); - Optional resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve); + Optional resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve); - return _policyEngine.getGrantedPrivileges(policiesToEvaluate, resolvedActorSpec, resolvedResourceSpec); - } finally { - _lockPolicyCache.readLock().unlock(); - } + return _policyEngine.getGrantedPrivileges(policiesToEvaluate, resolvedActorSpec, resolvedResourceSpec); } /** @@ -140,41 +133,38 @@ public AuthorizedActors authorizedActors( final List authorizedUsers = new ArrayList<>(); final List authorizedGroups = new ArrayList<>(); + final List authorizedRoles = new ArrayList<>(); boolean allUsers = false; boolean allGroups = false; - _lockPolicyCache.readLock().lock(); - try { - // Step 1: Find policies granting the privilege. - final List policiesToEvaluate = _policyCache.getOrDefault(privilege, new ArrayList<>()); - - Optional resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve); + // Step 1: Find policies granting the privilege. + final List policiesToEvaluate = getOrDefault(privilege, new ArrayList<>()); + Optional resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve); - // Step 2: For each policy, determine whether the resource is a match. - for (DataHubPolicyInfo policy : policiesToEvaluate) { - if (!PoliciesConfig.ACTIVE_POLICY_STATE.equals(policy.getState())) { - // Policy is not active, skip. - continue; - } + // Step 2: For each policy, determine whether the resource is a match. + for (DataHubPolicyInfo policy : policiesToEvaluate) { + if (!PoliciesConfig.ACTIVE_POLICY_STATE.equals(policy.getState())) { + // Policy is not active, skip. + continue; + } - final PolicyEngine.PolicyActors matchingActors = _policyEngine.getMatchingActors(policy, resolvedResourceSpec); + final PolicyEngine.PolicyActors matchingActors = _policyEngine.getMatchingActors(policy, resolvedResourceSpec); - // Step 3: For each matching policy, add actors that are authorized. - authorizedUsers.addAll(matchingActors.getUsers()); - authorizedGroups.addAll(matchingActors.getGroups()); - if (matchingActors.allUsers()) { - allUsers = true; - } - if (matchingActors.allGroups()) { - allGroups = true; - } + // Step 3: For each matching policy, add actors that are authorized. + authorizedUsers.addAll(matchingActors.getUsers()); + authorizedGroups.addAll(matchingActors.getGroups()); + authorizedRoles.addAll(matchingActors.getRoles()); + if (matchingActors.getAllUsers()) { + allUsers = true; + } + if (matchingActors.getAllGroups()) { + allGroups = true; } - } finally { - _lockPolicyCache.readLock().unlock(); } + // Step 4: Return all authorized users and groups. - return new AuthorizedActors(privilege, authorizedUsers, authorizedGroups, allUsers, allGroups); + return new AuthorizedActors(privilege, authorizedUsers, authorizedGroups, authorizedRoles, allUsers, allGroups); } /** @@ -234,6 +224,16 @@ private Optional getUrnFromRequestActor(String actor) { } } + private List getOrDefault(String key, List defaultValue) { + readLock.lock(); + try { + return _policyCache.getOrDefault(key, defaultValue); + } finally { + // To unlock the acquired read thread + readLock.unlock(); + } + } + /** * A {@link Runnable} used to periodically fetch a new instance of the policies Cache. * @@ -247,40 +247,42 @@ static class PolicyRefreshRunnable implements Runnable { private final Authentication _systemAuthentication; private final PolicyFetcher _policyFetcher; private final Map> _policyCache; - private final ReadWriteLock _lockPolicyCache; + private final Lock writeLock; + private final int count; @Override public void run() { try { // Populate new cache and swap. Map> newCache = new HashMap<>(); + Integer total = null; + String scrollId = null; - int start = 0; - int count = 30; - int total = 30; - - while (start < total) { + while (total == null || scrollId != null) { try { final PolicyFetcher.PolicyFetchResult - policyFetchResult = _policyFetcher.fetchPolicies(start, count, _systemAuthentication); + policyFetchResult = _policyFetcher.fetchPolicies(count, scrollId, _systemAuthentication); addPoliciesToCache(newCache, policyFetchResult.getPolicies()); total = policyFetchResult.getTotal(); - start = start + count; + scrollId = policyFetchResult.getScrollId(); } catch (Exception e) { log.error( - "Failed to retrieve policy urns! Skipping updating policy cache until next refresh. start: {}, count: {}", start, count, e); + "Failed to retrieve policy urns! Skipping updating policy cache until next refresh. count: {}, scrollId: {}", count, scrollId, e); return; } } - _lockPolicyCache.writeLock().lock(); + + writeLock.lock(); try { _policyCache.clear(); _policyCache.putAll(newCache); } finally { - _lockPolicyCache.writeLock().unlock(); + // To unlock the acquired write thread + writeLock.unlock(); } + log.debug(String.format("Successfully fetched %s policies.", total)); } catch (Exception e) { log.error("Caught exception while loading Policy cache. Will retry on next scheduled attempt.", e); diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java index f8c017ea74e1f6..da0ae26f2b1da6 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java @@ -32,7 +32,10 @@ import java.util.stream.Stream; import javax.annotation.Nullable; +import lombok.AccessLevel; +import lombok.AllArgsConstructor; import lombok.RequiredArgsConstructor; +import lombok.Value; import lombok.extern.slf4j.Slf4j; import static com.linkedin.metadata.Constants.*; @@ -75,6 +78,7 @@ public PolicyActors getMatchingActors( final Optional resource) { final List users = new ArrayList<>(); final List groups = new ArrayList<>(); + final List roles = new ArrayList<>(); boolean allUsers = false; boolean allGroups = false; if (policyMatchesResource(policy, resource)) { @@ -96,6 +100,9 @@ public PolicyActors getMatchingActors( if (actorFilter.getGroups() != null) { groups.addAll(actorFilter.getGroups()); } + if (actorFilter.getRoles() != null) { + roles.addAll(actorFilter.getRoles()); + } // 2. Fetch Actors based on resource ownership. if (actorFilter.isResourceOwners() && resource.isPresent()) { @@ -104,7 +111,7 @@ public PolicyActors getMatchingActors( groups.addAll(groupOwners(owners)); } } - return new PolicyActors(users, groups, allUsers, allGroups); + return new PolicyActors(users, groups, roles, allUsers, allGroups); } private boolean isPolicyApplicable( @@ -438,34 +445,14 @@ public boolean isGranted() { /** * Class used to represent all valid users of a policy. */ + @Value + @AllArgsConstructor(access = AccessLevel.PUBLIC) public static class PolicyActors { - final List _users; - final List _groups; - final Boolean _allUsers; - final Boolean _allGroups; - - public PolicyActors(final List users, final List groups, final Boolean allUsers, final Boolean allGroups) { - _users = users; - _groups = groups; - _allUsers = allUsers; - _allGroups = allGroups; - } - - public List getUsers() { - return _users; - } - - public List getGroups() { - return _groups; - } - - public Boolean allUsers() { - return _allUsers; - } - - public Boolean allGroups() { - return _allGroups; - } + List users; + List groups; + List roles; + Boolean allUsers; + Boolean allGroups; } private List userOwners(final Set owners) { diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java index 92d12bad41c9f5..c06da4d245f917 100644 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java +++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyFetcher.java @@ -8,8 +8,8 @@ import com.linkedin.metadata.query.SearchFlags; import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.query.filter.SortOrder; +import com.linkedin.metadata.search.ScrollResult; import com.linkedin.metadata.search.SearchEntity; -import com.linkedin.metadata.search.SearchResult; import com.linkedin.policy.DataHubPolicyInfo; import com.linkedin.r2.RemoteInvocationException; import java.net.URISyntaxException; @@ -18,11 +18,14 @@ import java.util.List; import java.util.Map; import java.util.Objects; +import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; import lombok.RequiredArgsConstructor; import lombok.Value; import lombok.extern.slf4j.Slf4j; +import javax.annotation.Nullable; + import static com.linkedin.metadata.Constants.DATAHUB_POLICY_INFO_ASPECT_NAME; import static com.linkedin.metadata.Constants.POLICY_ENTITY_NAME; @@ -38,22 +41,53 @@ public class PolicyFetcher { private static final SortCriterion POLICY_SORT_CRITERION = new SortCriterion().setField("lastUpdatedTimestamp").setOrder(SortOrder.DESCENDING); - public PolicyFetchResult fetchPolicies(int start, int count, Authentication authentication) - throws RemoteInvocationException, URISyntaxException { - return fetchPolicies(start, count, "", authentication); + /** + * This is to provide a scroll implementation using the start/count api. It is not efficient + * and the scroll native functions should be used instead. This does fix a failure to fetch + * policies when deep pagination happens where there are >10k policies. + * Exists primarily to prevent breaking change to the graphql api. + */ + @Deprecated + public CompletableFuture fetchPolicies(int start, String query, int count, Authentication authentication) { + return CompletableFuture.supplyAsync(() -> { + try { + PolicyFetchResult result = PolicyFetchResult.EMPTY; + String scrollId = ""; + int fetchedResults = 0; + + while (PolicyFetchResult.EMPTY.equals(result) && scrollId != null) { + PolicyFetchResult tmpResult = fetchPolicies(query, count, scrollId.isEmpty() ? null : scrollId, authentication); + fetchedResults += tmpResult.getPolicies().size(); + scrollId = tmpResult.getScrollId(); + if (fetchedResults > start) { + result = tmpResult; + } + } + + return result; + } catch (Exception e) { + throw new RuntimeException("Failed to list policies", e); + } + }); } - public PolicyFetchResult fetchPolicies(int start, int count, String query, Authentication authentication) + public PolicyFetchResult fetchPolicies(int count, @Nullable String scrollId, Authentication authentication) + throws RemoteInvocationException, URISyntaxException { + return fetchPolicies("", count, scrollId, authentication); + } + + public PolicyFetchResult fetchPolicies(String query, int count, @Nullable String scrollId, Authentication authentication) throws RemoteInvocationException, URISyntaxException { - log.debug(String.format("Batch fetching policies. start: %s, count: %s ", start, count)); - // First fetch all policy urns from start - start + count - SearchResult result = - _entityClient.search(POLICY_ENTITY_NAME, query, null, POLICY_SORT_CRITERION, start, count, authentication, - new SearchFlags().setFulltext(true)); + log.debug(String.format("Batch fetching policies. count: %s, scroll: %s", count, scrollId)); + + // First fetch all policy urns + ScrollResult result = _entityClient.scrollAcrossEntities(List.of(POLICY_ENTITY_NAME), query, null, scrollId, + null, count, new SearchFlags().setSkipCache(true).setSkipAggregates(true) + .setSkipHighlighting(true).setFulltext(true), authentication); List policyUrns = result.getEntities().stream().map(SearchEntity::getEntity).collect(Collectors.toList()); if (policyUrns.isEmpty()) { - return new PolicyFetchResult(Collections.emptyList(), 0); + return PolicyFetchResult.EMPTY; } // Fetch DataHubPolicyInfo aspects for each urn @@ -64,7 +98,7 @@ public PolicyFetchResult fetchPolicies(int start, int count, String query, Authe .filter(Objects::nonNull) .map(this::extractPolicy) .filter(Objects::nonNull) - .collect(Collectors.toList()), result.getNumEntities()); + .collect(Collectors.toList()), result.getNumEntities(), result.getScrollId()); } private Policy extractPolicy(EntityResponse entityResponse) { @@ -82,6 +116,10 @@ private Policy extractPolicy(EntityResponse entityResponse) { public static class PolicyFetchResult { List policies; int total; + @Nullable + String scrollId; + + public static final PolicyFetchResult EMPTY = new PolicyFetchResult(Collections.emptyList(), 0, null); } @Value diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java index 24ecfa6fefc856..b0b206001209c7 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java @@ -21,7 +21,9 @@ import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.EnvelopedAspectMap; import com.linkedin.entity.client.EntityClient; +import com.linkedin.identity.RoleMembership; import com.linkedin.metadata.query.SearchFlags; +import com.linkedin.metadata.search.ScrollResult; import com.linkedin.metadata.search.SearchEntity; import com.linkedin.metadata.search.SearchEntityArray; import com.linkedin.metadata.search.SearchResult; @@ -35,6 +37,8 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; + import org.testng.annotations.BeforeMethod; import org.testng.annotations.Test; @@ -52,6 +56,7 @@ import static org.mockito.Mockito.when; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertFalse; public class DataHubAuthorizerTest { @@ -60,6 +65,7 @@ public class DataHubAuthorizerTest { private static final Urn PARENT_DOMAIN_URN = UrnUtils.getUrn("urn:li:domain:parent"); private static final Urn CHILD_DOMAIN_URN = UrnUtils.getUrn("urn:li:domain:child"); + private static final Urn USER_WITH_ADMIN_ROLE = UrnUtils.getUrn("urn:li:corpuser:user-with-admin"); private EntityClient _entityClient; private DataHubAuthorizer _dataHubAuthorizer; @@ -89,30 +95,76 @@ public void setupTest() throws Exception { final EnvelopedAspectMap childDomainPolicyAspectMap = new EnvelopedAspectMap(); childDomainPolicyAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(childDomainPolicy.data()))); - final SearchResult policySearchResult = new SearchResult(); - policySearchResult.setNumEntities(3); - policySearchResult.setEntities( - new SearchEntityArray( - ImmutableList.of( - new SearchEntity().setEntity(activePolicyUrn), - new SearchEntity().setEntity(inactivePolicyUrn), - new SearchEntity().setEntity(parentDomainPolicyUrn), - new SearchEntity().setEntity(childDomainPolicyUrn) - ) - ) - ); - - when(_entityClient.search(eq("dataHubPolicy"), eq(""), isNull(), any(), anyInt(), anyInt(), any(), - eq(new SearchFlags().setFulltext(true)))).thenReturn(policySearchResult); - when(_entityClient.batchGetV2(eq(POLICY_ENTITY_NAME), - eq(ImmutableSet.of(activePolicyUrn, inactivePolicyUrn, parentDomainPolicyUrn, childDomainPolicyUrn)), eq(null), any())).thenReturn( - ImmutableMap.of( - activePolicyUrn, new EntityResponse().setUrn(activePolicyUrn).setAspects(activeAspectMap), - inactivePolicyUrn, new EntityResponse().setUrn(inactivePolicyUrn).setAspects(inactiveAspectMap), - parentDomainPolicyUrn, new EntityResponse().setUrn(parentDomainPolicyUrn).setAspects(parentDomainPolicyAspectMap), - childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspects(childDomainPolicyAspectMap) - ) - ); + final Urn adminPolicyUrn = Urn.createFromString("urn:li:dataHubPolicy:4"); + final DataHubActorFilter actorFilter = new DataHubActorFilter(); + actorFilter.setRoles(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:dataHubRole:Admin")))); + final DataHubPolicyInfo adminPolicy = createDataHubPolicyInfoFor(true, ImmutableList.of("EDIT_USER_PROFILE"), null, actorFilter); + final EnvelopedAspectMap adminPolicyAspectMap = new EnvelopedAspectMap(); + adminPolicyAspectMap.put(DATAHUB_POLICY_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(adminPolicy.data()))); + + final ScrollResult policySearchResult1 = new ScrollResult() + .setScrollId("1") + .setNumEntities(5) + .setEntities( + new SearchEntityArray( + ImmutableList.of(new SearchEntity().setEntity(activePolicyUrn)))); + + final ScrollResult policySearchResult2 = new ScrollResult() + .setScrollId("2") + .setNumEntities(5) + .setEntities( + new SearchEntityArray( + ImmutableList.of(new SearchEntity().setEntity(inactivePolicyUrn)))); + + final ScrollResult policySearchResult3 = new ScrollResult() + .setScrollId("3") + .setNumEntities(5) + .setEntities( + new SearchEntityArray( + ImmutableList.of(new SearchEntity().setEntity(parentDomainPolicyUrn)))); + + final ScrollResult policySearchResult4 = new ScrollResult() + .setScrollId("4") + .setNumEntities(5) + .setEntities( + new SearchEntityArray( + ImmutableList.of( + new SearchEntity().setEntity(childDomainPolicyUrn)))); + + final ScrollResult policySearchResult5 = new ScrollResult() + .setNumEntities(5) + .setEntities( + new SearchEntityArray( + ImmutableList.of( + new SearchEntity().setEntity(adminPolicyUrn)))); + + when(_entityClient.scrollAcrossEntities(eq(List.of("dataHubPolicy")), eq(""), isNull(), any(), isNull(), + anyInt(), eq(new SearchFlags().setFulltext(true).setSkipAggregates(true).setSkipHighlighting(true).setSkipCache(true)), any())) + .thenReturn(policySearchResult1) + .thenReturn(policySearchResult2) + .thenReturn(policySearchResult3) + .thenReturn(policySearchResult4) + .thenReturn(policySearchResult5); + + when(_entityClient.batchGetV2(eq(POLICY_ENTITY_NAME), any(), eq(null), any())).thenAnswer(args -> { + Set inputUrns = args.getArgument(1); + Urn urn = inputUrns.stream().findFirst().get(); + + switch (urn.toString()) { + case "urn:li:dataHubPolicy:0": + return Map.of(activePolicyUrn, new EntityResponse().setUrn(activePolicyUrn).setAspects(activeAspectMap)); + case "urn:li:dataHubPolicy:1": + return Map.of(inactivePolicyUrn, new EntityResponse().setUrn(inactivePolicyUrn).setAspects(inactiveAspectMap)); + case "urn:li:dataHubPolicy:2": + return Map.of(parentDomainPolicyUrn, new EntityResponse().setUrn(parentDomainPolicyUrn).setAspects(parentDomainPolicyAspectMap)); + case "urn:li:dataHubPolicy:3": + return Map.of(childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspects(childDomainPolicyAspectMap)); + case "urn:li:dataHubPolicy:4": + return Map.of(adminPolicyUrn, new EntityResponse().setUrn(adminPolicyUrn).setAspects(adminPolicyAspectMap)); + default: + throw new IllegalStateException(); + } + }); final List userUrns = ImmutableList.of(Urn.createFromString("urn:li:corpuser:user3"), Urn.createFromString("urn:li:corpuser:user4")); final List groupUrns = ImmutableList.of(Urn.createFromString("urn:li:corpGroup:group3"), Urn.createFromString("urn:li:corpGroup:group4")); @@ -136,6 +188,10 @@ childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspec when(_entityClient.batchGetV2(any(), eq(Collections.singleton(PARENT_DOMAIN_URN)), eq(Collections.singleton(DOMAIN_PROPERTIES_ASPECT_NAME)), any())) .thenReturn(createDomainPropertiesBatchResponse(null)); + // Mocks to reach role membership for a user urn + when(_entityClient.batchGetV2(any(), eq(Collections.singleton(USER_WITH_ADMIN_ROLE)), eq(Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME)), any()) + ).thenReturn(createUserRoleMembershipBatchResponse(USER_WITH_ADMIN_ROLE, UrnUtils.getUrn("urn:li:dataHubRole:Admin"))); + final Authentication systemAuthentication = new Authentication( new Actor(ActorType.USER, DATAHUB_SYSTEM_CLIENT_ID), "" @@ -146,7 +202,8 @@ childDomainPolicyUrn, new EntityResponse().setUrn(childDomainPolicyUrn).setAspec _entityClient, 10, 10, - DataHubAuthorizer.AuthorizationMode.DEFAULT + DataHubAuthorizer.AuthorizationMode.DEFAULT, + 1 // force pagination logic ); _dataHubAuthorizer.init(Collections.emptyMap(), createAuthorizerContext(systemAuthentication, _entityClient)); _dataHubAuthorizer.invalidateCache(); @@ -270,6 +327,32 @@ public void testAuthorizedActorsActivePolicy() throws Exception { )); } + @Test + public void testAuthorizedRoleActivePolicy() throws Exception { + final AuthorizedActors actors = + _dataHubAuthorizer.authorizedActors("EDIT_USER_PROFILE", // Should be inside the active policy. + Optional.of(new EntitySpec("dataset", "urn:li:dataset:1"))); + + assertFalse(actors.isAllUsers()); + assertFalse(actors.isAllGroups()); + assertEquals(new HashSet<>(actors.getUsers()), ImmutableSet.of()); + assertEquals(new HashSet<>(actors.getGroups()), ImmutableSet.of()); + assertEquals(new HashSet<>(actors.getRoles()), ImmutableSet.of(UrnUtils.getUrn("urn:li:dataHubRole:Admin"))); + } + + @Test + public void testAuthorizationBasedOnRoleIsAllowed() { + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); + + AuthorizationRequest request = new AuthorizationRequest( + USER_WITH_ADMIN_ROLE.toString(), + "EDIT_USER_PROFILE", + Optional.of(resourceSpec) + ); + + assertEquals(_dataHubAuthorizer.authorize(request).getType(), AuthorizationResult.Type.ALLOW); + } + @Test public void testAuthorizationOnDomainWithPrivilegeIsAllowed() { EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); @@ -310,13 +393,6 @@ public void testAuthorizationOnDomainWithoutPrivilegeIsDenied() { } private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List privileges, @Nullable final Urn domain) throws Exception { - final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo(); - dataHubPolicyInfo.setType(METADATA_POLICY_TYPE); - dataHubPolicyInfo.setState(active ? ACTIVE_POLICY_STATE : INACTIVE_POLICY_STATE); - dataHubPolicyInfo.setPrivileges(new StringArray(privileges)); - dataHubPolicyInfo.setDisplayName("My Test Display"); - dataHubPolicyInfo.setDescription("My test display!"); - dataHubPolicyInfo.setEditable(true); List users = ImmutableList.of(Urn.createFromString("urn:li:corpuser:user1"), Urn.createFromString("urn:li:corpuser:user2")); List groups = ImmutableList.of(Urn.createFromString("urn:li:corpGroup:group1"), Urn.createFromString("urn:li:corpGroup:group2")); @@ -327,6 +403,20 @@ private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List p actorFilter.setAllGroups(true); actorFilter.setUsers(new UrnArray(users)); actorFilter.setGroups(new UrnArray(groups)); + + return createDataHubPolicyInfoFor(active, privileges, domain, actorFilter); + } + + private DataHubPolicyInfo createDataHubPolicyInfoFor(boolean active, List privileges, + @Nullable final Urn domain, DataHubActorFilter actorFilter) throws Exception { + final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo(); + dataHubPolicyInfo.setType(METADATA_POLICY_TYPE); + dataHubPolicyInfo.setState(active ? ACTIVE_POLICY_STATE : INACTIVE_POLICY_STATE); + dataHubPolicyInfo.setPrivileges(new StringArray(privileges)); + dataHubPolicyInfo.setDisplayName("My Test Display"); + dataHubPolicyInfo.setDescription("My test display!"); + dataHubPolicyInfo.setEditable(true); + dataHubPolicyInfo.setActors(actorFilter); final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); @@ -397,6 +487,21 @@ private Map createDomainPropertiesBatchResponse(@Nullable f return batchResponse; } + private Map createUserRoleMembershipBatchResponse(final Urn userUrn, @Nullable final Urn roleUrn) { + final Map batchResponse = new HashMap<>(); + final EntityResponse response = new EntityResponse(); + EnvelopedAspectMap aspectMap = new EnvelopedAspectMap(); + final RoleMembership membership = new RoleMembership(); + if (roleUrn != null) { + membership.setRoles(new UrnArray(roleUrn)); + } + aspectMap.put(ROLE_MEMBERSHIP_ASPECT_NAME, new EnvelopedAspect() + .setValue(new com.linkedin.entity.Aspect(membership.data()))); + response.setAspects(aspectMap); + batchResponse.put(userUrn, response); + return batchResponse; + } + private AuthorizerContext createAuthorizerContext(final Authentication systemAuthentication, final EntityClient entityClient) { return new AuthorizerContext(Collections.emptyMap(), new DefaultEntitySpecResolver(systemAuthentication, entityClient)); } diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java index be8c948f8ef897..2790c16ba75e62 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java @@ -1041,6 +1041,7 @@ public void testGetMatchingActorsResourceMatch() throws Exception { Urn.createFromString("urn:li:corpuser:user2")))); actorFilter.setGroups(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:corpGroup:group1"), Urn.createFromString("urn:li:corpGroup:group2")))); + actorFilter.setRoles(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:role:Admin")))); dataHubPolicyInfo.setActors(actorFilter); final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); @@ -1056,8 +1057,8 @@ public void testGetMatchingActorsResourceMatch() throws Exception { Collections.emptySet(), Collections.emptySet()); PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); - assertTrue(actors.allUsers()); - assertTrue(actors.allGroups()); + assertTrue(actors.getAllUsers()); + assertTrue(actors.getAllGroups()); assertEquals(actors.getUsers(), ImmutableList.of(Urn.createFromString("urn:li:corpuser:user1"), Urn.createFromString("urn:li:corpuser:user2"), @@ -1068,6 +1069,8 @@ public void testGetMatchingActorsResourceMatch() throws Exception { Urn.createFromString("urn:li:corpGroup:group2"), Urn.createFromString(AUTHORIZED_GROUP) // Resource Owner )); + assertEquals(actors.getRoles(), ImmutableList.of(Urn.createFromString("urn:li:role:Admin"))); + // Verify aspect client called, entity client not called. verify(_entityClient, times(0)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), eq(null), any()); @@ -1106,15 +1109,58 @@ public void testGetMatchingActorsNoResourceMatch() throws Exception { buildEntityResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy. PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); - assertFalse(actors.allUsers()); - assertFalse(actors.allGroups()); + assertFalse(actors.getAllUsers()); + assertFalse(actors.getAllGroups()); assertEquals(actors.getUsers(), Collections.emptyList()); assertEquals(actors.getGroups(), Collections.emptyList()); + //assertEquals(actors.getRoles(), Collections.emptyList()); // Verify no network calls verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } + @Test + public void testGetMatchingActorsByRoleResourceMatch() throws Exception { + final DataHubPolicyInfo dataHubPolicyInfo = new DataHubPolicyInfo(); + dataHubPolicyInfo.setType(METADATA_POLICY_TYPE); + dataHubPolicyInfo.setState(ACTIVE_POLICY_STATE); + dataHubPolicyInfo.setPrivileges(new StringArray("EDIT_ENTITY_TAGS")); + dataHubPolicyInfo.setDisplayName("My Test Display"); + dataHubPolicyInfo.setDescription("My test display!"); + dataHubPolicyInfo.setEditable(true); + + final DataHubActorFilter actorFilter = new DataHubActorFilter(); + actorFilter.setResourceOwners(true); + actorFilter.setAllUsers(false); + actorFilter.setAllGroups(false); + actorFilter.setRoles(new UrnArray(ImmutableList.of(Urn.createFromString("urn:li:dataHubRole:Editor")))); + dataHubPolicyInfo.setActors(actorFilter); + + final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); + resourceFilter.setAllResources(false); + resourceFilter.setType("dataset"); + StringArray resourceUrns = new StringArray(); + resourceUrns.add(RESOURCE_URN); + resourceFilter.setResources(resourceUrns); + dataHubPolicyInfo.setResources(resourceFilter); + + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(), + Collections.emptySet(), Collections.emptySet()); + + PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); + + assertFalse(actors.getAllUsers()); + assertFalse(actors.getAllGroups()); + + assertEquals(actors.getUsers(), ImmutableList.of()); + assertEquals(actors.getGroups(), ImmutableList.of()); + assertEquals(actors.getRoles(), ImmutableList.of(Urn.createFromString("urn:li:dataHubRole:Editor"))); + + // Verify aspect client called, entity client not called. + verify(_entityClient, times(0)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), + eq(null), any()); + } + private Ownership createOwnershipAspect(final Boolean addUserOwner, final Boolean addGroupOwner) throws Exception { final Ownership ownershipAspect = new Ownership(); final OwnerArray owners = new OwnerArray(); diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index b817208672e08b..46aa02d98572e6 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -11,6 +11,8 @@ authentication: # Key used to validate incoming tokens. Should typically be the same as authentication.tokenService.signingKey signingKey: ${DATAHUB_TOKEN_SERVICE_SIGNING_KEY:WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94=} salt: ${DATAHUB_TOKEN_SERVICE_SALT:ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O=} + # Required for unauthenticated health check endpoints - best not to remove. + - type: com.datahub.authentication.authenticator.HealthStatusAuthenticator # Normally failures are only warnings, enable this to throw them. logAuthenticatorExceptions: ${METADATA_SERVICE_AUTHENTICATOR_EXCEPTIONS_ENABLED:false} @@ -37,6 +39,7 @@ authorization: defaultAuthorizer: enabled: ${AUTH_POLICIES_ENABLED:true} cacheRefreshIntervalSecs: ${POLICY_CACHE_REFRESH_INTERVAL_SECONDS:120} + cachePolicyFetchSize: ${POLICY_CACHE_FETCH_SIZE:1000} # Enables authorization of reads, writes, and deletes on REST APIs. Defaults to false for backwards compatibility, but should become true down the road restApiAuthorization: ${REST_API_AUTHORIZATION_ENABLED:false} @@ -248,6 +251,7 @@ neo4j: username: ${NEO4J_USERNAME:neo4j} password: ${NEO4J_PASSWORD:datahub} uri: ${NEO4J_URI:bolt://localhost} + database: ${NEO4J_DATABASE:graph.db} maxConnectionPoolSize: ${NEO4J_MAX_CONNECTION_POOL_SIZE:100} maxConnectionAcquisitionTimeout: ${NEO4J_MAX_CONNECTION_ACQUISITION_TIMEOUT_IN_SECONDS:60} maxConnectionLifetimeInSeconds: ${NEO4j_MAX_CONNECTION_LIFETIME_IN_SECONDS:3600} @@ -282,6 +286,8 @@ bootstrap: enabled: ${UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED:false} # enable to run the upgrade to migrate legacy default browse paths to new ones backfillBrowsePathsV2: enabled: ${BACKFILL_BROWSE_PATHS_V2:false} # Enables running the backfill of browsePathsV2 upgrade step. There are concerns about the load of this step so hiding it behind a flag. Deprecating in favor of running through SystemUpdate + reprocessDefaultBrowsePathsV2: + enabled: ${REPROCESS_DEFAULT_BROWSE_PATHS_V2:false} # reprocess V2 browse paths which were set to the default: {"path":[{"id":"Default"}]} policies: file: ${BOOTSTRAP_POLICIES_FILE:classpath:boot/policies.json} # eg for local file diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java index 5b298a453547a7..663234e2519faf 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubAuthorizerFactory.java @@ -32,6 +32,9 @@ public class DataHubAuthorizerFactory { @Value("${authorization.defaultAuthorizer.cacheRefreshIntervalSecs}") private Integer policyCacheRefreshIntervalSeconds; + @Value("${authorization.defaultAuthorizer.cachePolicyFetchSize}") + private Integer policyCacheFetchSize; + @Value("${authorization.defaultAuthorizer.enabled:true}") private Boolean policiesEnabled; @@ -44,6 +47,6 @@ protected DataHubAuthorizer getInstance() { : DataHubAuthorizer.AuthorizationMode.ALLOW_ALL; return new DataHubAuthorizer(systemAuthentication, entityClient, 10, - policyCacheRefreshIntervalSeconds, mode); + policyCacheRefreshIntervalSeconds, mode, policyCacheFetchSize); } } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java index e62dfd50f897d7..87670ce10f481a 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/common/Neo4jGraphServiceFactory.java @@ -6,8 +6,10 @@ import com.linkedin.metadata.models.registry.EntityRegistry; import javax.annotation.Nonnull; import org.neo4j.driver.Driver; +import org.neo4j.driver.SessionConfig; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Qualifier; +import org.springframework.beans.factory.annotation.Value; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Configuration; import org.springframework.context.annotation.Import; @@ -24,10 +26,13 @@ public class Neo4jGraphServiceFactory { @Qualifier("entityRegistry") private EntityRegistry entityRegistry; + @Value("${neo4j.database}") + private String neo4jDatabase; + @Bean(name = "neo4jGraphService") @Nonnull protected Neo4jGraphService getInstance() { LineageRegistry lineageRegistry = new LineageRegistry(entityRegistry); - return new Neo4jGraphService(lineageRegistry, neo4jDriver); + return new Neo4jGraphService(lineageRegistry, neo4jDriver, SessionConfig.forDatabase(neo4jDatabase)); } } diff --git a/metadata-service/health-servlet/build.gradle b/metadata-service/health-servlet/build.gradle deleted file mode 100644 index 6095f724b3cd44..00000000000000 --- a/metadata-service/health-servlet/build.gradle +++ /dev/null @@ -1,22 +0,0 @@ -apply plugin: 'java' - -dependencies { - - implementation project(':metadata-service:factories') - - implementation externalDependency.guava - implementation externalDependency.reflections - implementation externalDependency.springBoot - implementation externalDependency.springCore - implementation externalDependency.springDocUI - implementation externalDependency.springWeb - implementation externalDependency.springWebMVC - implementation externalDependency.springBeans - implementation externalDependency.springContext - implementation externalDependency.slf4jApi - compileOnly externalDependency.lombok - implementation externalDependency.antlr4Runtime - implementation externalDependency.antlr4 - - annotationProcessor externalDependency.lombok -} \ No newline at end of file diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java index 71e8c79a2275a5..e4f49df90c3921 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/config/SpringWebConfig.java @@ -44,7 +44,6 @@ public GroupedOpenApi defaultOpenApiGroup() { .group("default") .packagesToExclude( "io.datahubproject.openapi.operations", - "com.datahub.health", "io.datahubproject.openapi.health" ).build(); } @@ -55,7 +54,6 @@ public GroupedOpenApi operationsOpenApiGroup() { .group("operations") .packagesToScan( "io.datahubproject.openapi.operations", - "com.datahub.health", "io.datahubproject.openapi.health" ).build(); } diff --git a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/health/HealthCheckController.java similarity index 79% rename from metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java rename to metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/health/HealthCheckController.java index c200e63e0d4977..c90603bf88c31e 100644 --- a/metadata-service/health-servlet/src/main/java/com/datahub/health/controller/HealthCheckController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/health/HealthCheckController.java @@ -1,5 +1,6 @@ -package com.datahub.health.controller; +package io.datahubproject.openapi.health; +import com.google.common.base.Supplier; import com.google.common.base.Suppliers; import com.linkedin.gms.factory.config.ConfigurationProvider; import io.swagger.v3.oas.annotations.tags.Tag; @@ -9,7 +10,6 @@ import java.util.List; import java.util.Map; import java.util.concurrent.TimeUnit; -import java.util.function.Supplier; import org.opensearch.action.admin.cluster.health.ClusterHealthRequest; import org.opensearch.action.admin.cluster.health.ClusterHealthResponse; @@ -27,7 +27,7 @@ @RestController -@RequestMapping("/check") +@RequestMapping("/") @Tag(name = "HealthCheck", description = "An API for checking health of GMS and its clients.") public class HealthCheckController { @Autowired @@ -41,6 +41,12 @@ public HealthCheckController(ConfigurationProvider config) { this::getElasticHealth, config.getHealthCheck().getCacheDurationSeconds(), TimeUnit.SECONDS); } + @GetMapping(path = "/check/ready", produces = MediaType.APPLICATION_JSON_VALUE) + public ResponseEntity getCombinedHealthCheck(String... checks) { + return ResponseEntity.status(getCombinedDebug(checks).getStatusCode()) + .body(getCombinedDebug(checks).getStatusCode().is2xxSuccessful()); + } + /** * Combined health check endpoint for checking GMS clients. * For now, just checks the health of the ElasticSearch client @@ -48,11 +54,10 @@ public HealthCheckController(ConfigurationProvider config) { * that component). The status code will be 200 if all components are okay, and 500 if one or more components are not * healthy. */ - @GetMapping(path = "/ready", produces = MediaType.APPLICATION_JSON_VALUE) - public ResponseEntity>> getCombinedHealthCheck(String... checks) { - + @GetMapping(path = "/debug/ready", produces = MediaType.APPLICATION_JSON_VALUE) + public ResponseEntity>> getCombinedDebug(String... checks) { Map>> healthChecks = new HashMap<>(); - healthChecks.put("elasticsearch", this::getElasticHealthWithCache); + healthChecks.put("elasticsearch", this::getElasticDebugWithCache); // Add new components here List componentsToCheck = checks != null && checks.length > 0 @@ -67,7 +72,6 @@ public ResponseEntity>> getCombinedHealthChec .get()); } - boolean isHealthy = componentHealth.values().stream().allMatch(resp -> resp.getStatusCode() == HttpStatus.OK); if (isHealthy) { return ResponseEntity.ok(componentHealth); @@ -75,12 +79,18 @@ public ResponseEntity>> getCombinedHealthChec return ResponseEntity.status(HttpStatus.SERVICE_UNAVAILABLE).body(componentHealth); } + @GetMapping(path = "/check/elastic", produces = MediaType.APPLICATION_JSON_VALUE) + public ResponseEntity getElasticHealthWithCache() { + return ResponseEntity.status(getElasticDebugWithCache().getStatusCode()) + .body(getElasticDebugWithCache().getStatusCode().is2xxSuccessful()); + } + /** * Checks the memoized cache for the latest elastic health check result * @return The ResponseEntity containing the health check result */ - @GetMapping(path = "/elastic", produces = MediaType.APPLICATION_JSON_VALUE) - public ResponseEntity getElasticHealthWithCache() { + @GetMapping(path = "/debug/elastic", produces = MediaType.APPLICATION_JSON_VALUE) + public ResponseEntity getElasticDebugWithCache() { return this.memoizedSupplier.get(); } diff --git a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java index 442ac1b0d287b3..e5f3e223ff505d 100644 --- a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java +++ b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java @@ -75,7 +75,7 @@ public AuthorizationResult authorize(@Nonnull AuthorizationRequest request) { @Override public AuthorizedActors authorizedActors(String privilege, Optional resourceSpec) { - return new AuthorizedActors("ALL", null, null, true, true); + return new AuthorizedActors("ALL", null, null, null, true, true); } } diff --git a/metadata-service/restli-api/build.gradle b/metadata-service/restli-api/build.gradle index ed4f4118dba307..f182d11b6baebf 100644 --- a/metadata-service/restli-api/build.gradle +++ b/metadata-service/restli-api/build.gradle @@ -8,4 +8,10 @@ dependencies { restClientCompile spec.product.pegasus.d2 restClientCompile spec.product.pegasus.restliClient + + constraints { + restClientCompile(externalDependency.zookeeper) { + because("CVE-2023-44981") + } + } } \ No newline at end of file diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java index b9661ec75e1b1f..84d0ed6b9594df 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/EntityClient.java @@ -241,7 +241,7 @@ public SearchResult searchAcrossEntities(@Nonnull List entities, @Nonnul */ @Nonnull ScrollResult scrollAcrossEntities(@Nonnull List entities, @Nonnull String input, - @Nullable Filter filter, @Nullable String scrollId, @Nonnull String keepAlive, int count, @Nullable SearchFlags searchFlags, + @Nullable Filter filter, @Nullable String scrollId, @Nullable String keepAlive, int count, @Nullable SearchFlags searchFlags, @Nonnull Authentication authentication) throws RemoteInvocationException; diff --git a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java index 47a00e711a9350..2716e27518fcc5 100644 --- a/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java +++ b/metadata-service/restli-client/src/main/java/com/linkedin/entity/client/RestliEntityClient.java @@ -482,11 +482,11 @@ public SearchResult searchAcrossEntities(@Nonnull List entities, @Nonnul @Nonnull @Override public ScrollResult scrollAcrossEntities(@Nonnull List entities, @Nonnull String input, - @Nullable Filter filter, @Nullable String scrollId, @Nonnull String keepAlive, int count, + @Nullable Filter filter, @Nullable String scrollId, @Nullable String keepAlive, int count, @Nullable SearchFlags searchFlags, @Nonnull Authentication authentication) throws RemoteInvocationException { final EntitiesDoScrollAcrossEntitiesRequestBuilder requestBuilder = - ENTITIES_REQUEST_BUILDERS.actionScrollAcrossEntities().inputParam(input).countParam(count).keepAliveParam(keepAlive); + ENTITIES_REQUEST_BUILDERS.actionScrollAcrossEntities().inputParam(input).countParam(count); if (entities != null) { requestBuilder.entitiesParam(new StringArray(entities)); @@ -500,6 +500,9 @@ public ScrollResult scrollAcrossEntities(@Nonnull List entities, @Nonnul if (searchFlags != null) { requestBuilder.searchFlagsParam(searchFlags); } + if (keepAlive != null) { + requestBuilder.keepAliveParam(keepAlive); + } return sendClientRequest(requestBuilder, authentication).getEntity(); } diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java index 64f59780b887f3..cbfeeaef860d34 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java @@ -193,7 +193,7 @@ BrowseResult browse(@Nonnull String entityName, @Nonnull String path, @Nullable */ @Nonnull ScrollResult fullTextScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size, @Nullable SearchFlags searchFlags); + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size, @Nullable SearchFlags searchFlags); /** * Gets a list of documents that match given search request. The results are aggregated and filters are applied to the @@ -210,7 +210,7 @@ ScrollResult fullTextScroll(@Nonnull List entities, @Nonnull String inpu */ @Nonnull ScrollResult structuredScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size, @Nullable SearchFlags searchFlags); + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size, @Nullable SearchFlags searchFlags); /** * Max result size returned by the underlying search backend diff --git a/metadata-service/war/build.gradle b/metadata-service/war/build.gradle index 122c2b9d5357be..54e95fdcfe5798 100644 --- a/metadata-service/war/build.gradle +++ b/metadata-service/war/build.gradle @@ -17,7 +17,6 @@ dependencies { runtimeOnly project(':metadata-service:servlet') runtimeOnly project(':metadata-service:auth-servlet-impl') runtimeOnly project(':metadata-service:graphql-servlet-impl') - runtimeOnly project(':metadata-service:health-servlet') runtimeOnly project(':metadata-service:openapi-servlet') runtimeOnly project(':metadata-service:openapi-entity-servlet') runtimeOnly project(':metadata-service:openapi-analytics-servlet') diff --git a/metadata-service/war/src/main/resources/boot/data_platforms.json b/metadata-service/war/src/main/resources/boot/data_platforms.json index 3d956c5774dedb..3c70eda8561b86 100644 --- a/metadata-service/war/src/main/resources/boot/data_platforms.json +++ b/metadata-service/war/src/main/resources/boot/data_platforms.json @@ -564,5 +564,15 @@ "type": "KEY_VALUE_STORE", "logoUrl": "/assets/platforms/dynamodblogo.png" } + }, + { + "urn": "urn:li:dataPlatform:fivetran", + "aspect": { + "datasetNameDelimiter": ".", + "name": "fivetran", + "displayName": "Fivetran", + "type": "OTHERS", + "logoUrl": "/assets/platforms/fivetranlogo.png" + } } ] diff --git a/metadata-service/war/src/main/webapp/WEB-INF/openapiServlet-servlet.xml b/metadata-service/war/src/main/webapp/WEB-INF/openapiServlet-servlet.xml index 3077cfb0626387..fb2bc6c0336cde 100644 --- a/metadata-service/war/src/main/webapp/WEB-INF/openapiServlet-servlet.xml +++ b/metadata-service/war/src/main/webapp/WEB-INF/openapiServlet-servlet.xml @@ -3,7 +3,7 @@ xmlns:context="http://www.springframework.org/schema/context" xsi:schemaLocation="http://www.springframework.org/schema/beans http://www.springframework.org/schema/beans/spring-beans-3.0.xsd http://www.springframework.org/schema/context http://www.springframework.org/schema/context/spring-context-3.0.xsd"> - + diff --git a/settings.gradle b/settings.gradle index 52de461383b5e8..d2844fe00cdbc3 100644 --- a/settings.gradle +++ b/settings.gradle @@ -8,7 +8,6 @@ include 'metadata-service:auth-config' include 'metadata-service:auth-impl' include 'metadata-service:auth-filter' include 'metadata-service:auth-servlet-impl' -include 'metadata-service:health-servlet' include 'metadata-service:restli-api' include 'metadata-service:restli-client' include 'metadata-service:restli-servlet-impl' diff --git a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js index aeceaf99be889c..c6e9d93f71b8c3 100644 --- a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js +++ b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js @@ -27,7 +27,7 @@ describe("glossary sidebar navigation test", () => { cy.waitTextVisible("Moved Glossary Term!"); // Ensure the new term is under the parent term group in the navigation sidebar - cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).click(); + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).click().wait(3000); cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTerm).should("be.visible"); // Move a term group from the root level to be under a parent term group @@ -41,7 +41,7 @@ describe("glossary sidebar navigation test", () => { cy.waitTextVisible("Moved Term Group!"); // Ensure it is no longer on the sidebar navigator at the top level but shows up under the new parent - cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryParentGroup).click(); + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryParentGroup).click().wait(3000); cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTermGroup).should("be.visible"); // Delete a term group diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/impact_analysis.js b/smoke-test/tests/cypress/cypress/e2e/lineage/impact_analysis.js index defb786d1fa5db..784ccf8f0f87db 100644 --- a/smoke-test/tests/cypress/cypress/e2e/lineage/impact_analysis.js +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/impact_analysis.js @@ -21,6 +21,10 @@ const startAtDataSetLineage = () => { } describe("impact analysis", () => { + beforeEach(() => { + cy.on('uncaught:exception', (err, runnable) => { return false; }); + }); + it("can see 1 hop of lineage by default", () => { startAtDataSetLineage() diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js index 9559435ff01c85..8d689c7e2303c4 100644 --- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js @@ -81,7 +81,7 @@ describe("create and manage group", () => { cy.focused().type(expected_name); cy.get(".ant-select-item-option").contains(expected_name, { matchCase: false }).click(); cy.focused().blur(); - cy.contains(expected_name).should("have.length", 1); + cy.contains(expected_name, { matchCase: false }).should("have.length", 1); cy.get('[role="dialog"] button').contains("Done").click(); cy.waitTextVisible("Owners Added"); cy.contains(expected_name, { matchCase: false }).should("be.visible"); diff --git a/smoke-test/tests/privileges/test_privileges.py b/smoke-test/tests/privileges/test_privileges.py index 13d6b6cf3415a4..d0f00734ae9f37 100644 --- a/smoke-test/tests/privileges/test_privileges.py +++ b/smoke-test/tests/privileges/test_privileges.py @@ -52,6 +52,20 @@ def privileges_and_test_user_setup(admin_session): wait_for_writes_to_sync() +@tenacity.retry( + stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec) +) +def _ensure_cant_perform_action(session, json,assertion_key): + action_response = session.post( + f"{get_frontend_url()}/api/v2/graphql", json=json) + action_response.raise_for_status() + action_data = action_response.json() + + assert action_data["errors"][0]["extensions"]["code"] == 403 + assert action_data["errors"][0]["extensions"]["type"] == "UNAUTHORIZED" + assert action_data["data"][assertion_key] == None + + @tenacity.retry( stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec) ) @@ -67,20 +81,6 @@ def _ensure_can_create_secret(session, json, urn): assert secret_data["data"]["createSecret"] == urn -@tenacity.retry( - stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec) -) -def _ensure_cant_create_secret(session, json): - create_secret_response = session.post( - f"{get_frontend_url()}/api/v2/graphql", json=json) - create_secret_response.raise_for_status() - create_secret_data = create_secret_response.json() - - assert create_secret_data["errors"][0]["extensions"]["code"] == 403 - assert create_secret_data["errors"][0]["extensions"]["type"] == "UNAUTHORIZED" - assert create_secret_data["data"]["createSecret"] == None - - @tenacity.retry( stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec) ) @@ -99,17 +99,34 @@ def _ensure_can_create_ingestion_source(session, json): @tenacity.retry( - stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec) + stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec) ) -def _ensure_cant_create_ingestion_source(session, json): - create_source_response = session.post( +def _ensure_can_create_access_token(session, json): + create_access_token_success = session.post( f"{get_frontend_url()}/api/v2/graphql", json=json) - create_source_response.raise_for_status() - create_source_data = create_source_response.json() + create_access_token_success.raise_for_status() + ingestion_data = create_access_token_success.json() + + assert ingestion_data + assert ingestion_data["data"] + assert ingestion_data["data"]["createAccessToken"] + assert ingestion_data["data"]["createAccessToken"]["accessToken"] is not None + assert ingestion_data["data"]["createAccessToken"]["__typename"] == "AccessToken" - assert create_source_data["errors"][0]["extensions"]["code"] == 403 - assert create_source_data["errors"][0]["extensions"]["type"] == "UNAUTHORIZED" - assert create_source_data["data"]["createIngestionSource"] == None + +@tenacity.retry( + stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec) +) +def _ensure_can_create_user_policy(session, json): + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + response.raise_for_status() + res_data = response.json() + + assert res_data + assert res_data["data"] + assert res_data["data"]["createPolicy"] is not None + + return res_data["data"]["createPolicy"] @pytest.mark.dependency(depends=["test_healthchecks"]) @@ -132,7 +149,7 @@ def test_privilege_to_create_and_manage_secrets(): } }, } - _ensure_cant_create_secret(user_session, create_secret) + _ensure_cant_perform_action(user_session, create_secret,"createSecret") # Assign privileges to the new user to manage secrets @@ -166,7 +183,7 @@ def test_privilege_to_create_and_manage_secrets(): remove_policy(policy_urn, admin_session) # Ensure user can't create secret after policy is removed - _ensure_cant_create_secret(user_session, create_secret) + _ensure_cant_perform_action(user_session, create_secret,"createSecret") @pytest.mark.dependency(depends=["test_healthchecks"]) @@ -182,11 +199,18 @@ def test_privilege_to_create_and_manage_ingestion_source(): createIngestionSource(input: $input)\n}""", "variables": {"input":{"type":"snowflake","name":"test","config": {"recipe": - "{\"source\":{\"type\":\"snowflake\",\"config\":{\"account_id\":null,\"include_table_lineage\":true,\"include_view_lineage\":true,\"include_tables\":true,\"include_views\":true,\"profiling\":{\"enabled\":true,\"profile_table_level_only\":true},\"stateful_ingestion\":{\"enabled\":true}}}}", + """{\"source\":{\"type\":\"snowflake\",\"config\":{ + \"account_id\":null, + \"include_table_lineage\":true, + \"include_view_lineage\":true, + \"include_tables\":true, + \"include_views\":true, + \"profiling\":{\"enabled\":true,\"profile_table_level_only\":true}, + \"stateful_ingestion\":{\"enabled\":true}}}}""", "executorId":"default","debugMode":False,"extraArgs":[]}}}, } - _ensure_cant_create_ingestion_source(user_session, create_ingestion_source) + _ensure_cant_perform_action(user_session, create_ingestion_source, "createIngestionSource") # Assign privileges to the new user to manage ingestion source @@ -201,7 +225,14 @@ def test_privilege_to_create_and_manage_ingestion_source(): updateIngestionSource(urn: $urn, input: $input)\n}""", "variables": {"urn":ingestion_source_urn, "input":{"type":"snowflake","name":"test updated", - "config":{"recipe":"{\"source\":{\"type\":\"snowflake\",\"config\":{\"account_id\":null,\"include_table_lineage\":true,\"include_view_lineage\":true,\"include_tables\":true,\"include_views\":true,\"profiling\":{\"enabled\":true,\"profile_table_level_only\":true},\"stateful_ingestion\":{\"enabled\":true}}}}", + "config":{"recipe":"""{\"source\":{\"type\":\"snowflake\",\"config\":{ + \"account_id\":null, + \"include_table_lineage\":true, + \"include_view_lineage\":true, + \"include_tables\":true, + \"include_views\":true, + \"profiling\":{\"enabled\":true,\"profile_table_level_only\":true}, + \"stateful_ingestion\":{\"enabled\":true}}}}""", "executorId":"default","debugMode":False,"extraArgs":[]}}} } @@ -238,4 +269,182 @@ def test_privilege_to_create_and_manage_ingestion_source(): remove_policy(policy_urn, admin_session) # Ensure that user can't create ingestion source after policy is removed - _ensure_cant_create_ingestion_source(user_session, create_ingestion_source) \ No newline at end of file + _ensure_cant_perform_action(user_session, create_ingestion_source, "createIngestionSource") + + +@pytest.mark.dependency(depends=["test_healthchecks"]) +def test_privilege_to_create_and_manage_access_tokens(): + + (admin_user, admin_pass) = get_admin_credentials() + admin_session = login_as(admin_user, admin_pass) + user_session = login_as("user", "user") + + + # Verify new user can't create access token + create_access_token = { + "query": """mutation createAccessToken($input: CreateAccessTokenInput!) {\n + createAccessToken(input: $input) {\n accessToken\n __typename\n }\n}\n""", + "variables": {"input":{"actorUrn":"urn:li:corpuser:user", + "type":"PERSONAL", + "duration":"ONE_MONTH", + "name":"test", + "description":"test"}} + } + + _ensure_cant_perform_action(user_session, create_access_token,"createAccessToken") + + + # Assign privileges to the new user to create and manage access tokens + policy_urn = create_user_policy("urn:li:corpuser:user", ["MANAGE_ACCESS_TOKENS"], admin_session) + + + # Verify new user can create and manage access token(create, revoke) + # Create a access token + _ensure_can_create_access_token(user_session, create_access_token) + + + # List access tokens first to get token id + list_access_tokens = { + "query": """query listAccessTokens($input: ListAccessTokenInput!) {\n + listAccessTokens(input: $input) {\n + start\n count\n total\n tokens {\n urn\n type\n + id\n name\n description\n actorUrn\n ownerUrn\n + createdAt\n expiresAt\n __typename\n }\n __typename\n }\n}\n""", + "variables": { + "input":{ + "start":0,"count":10,"filters":[{ + "field":"ownerUrn", + "values":["urn:li:corpuser:user"]}]} + } + } + + list_tokens_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=list_access_tokens) + list_tokens_response.raise_for_status() + list_tokens_data = list_tokens_response.json() + + assert list_tokens_data + assert list_tokens_data["data"] + assert list_tokens_data["data"]["listAccessTokens"]["tokens"][0]["id"] is not None + + access_token_id = list_tokens_data["data"]["listAccessTokens"]["tokens"][0]["id"] + + + # Revoke access token + revoke_access_token = { + "query": "mutation revokeAccessToken($tokenId: String!) {\n revokeAccessToken(tokenId: $tokenId)\n}\n", + "variables": { + "tokenId": access_token_id + }, + } + + revoke_token_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=revoke_access_token) + revoke_token_response.raise_for_status() + revoke_token_data = revoke_token_response.json() + + assert revoke_token_data + assert revoke_token_data["data"] + assert revoke_token_data["data"]["revokeAccessToken"] + assert revoke_token_data["data"]["revokeAccessToken"] is True + + + # Remove the policy + remove_policy(policy_urn, admin_session) + + + # Ensure that user can't create access token after policy is removed + _ensure_cant_perform_action(user_session, create_access_token,"createAccessToken") + + +@pytest.mark.dependency(depends=["test_healthchecks"]) +def test_privilege_to_create_and_manage_policies(): + + (admin_user, admin_pass) = get_admin_credentials() + admin_session = login_as(admin_user, admin_pass) + user_session = login_as("user", "user") + + + # Verify new user can't create a policy + create_policy = { + "query": """mutation createPolicy($input: PolicyUpdateInput!) {\n + createPolicy(input: $input) }""", + "variables": { + "input": { + "type": "PLATFORM", + "name": "Policy Name", + "description": "Policy Description", + "state": "ACTIVE", + "resources": {"filter":{"criteria":[]}}, + "privileges": ["MANAGE_POLICIES"], + "actors": { + "users": [], + "resourceOwners": False, + "allUsers": True, + "allGroups": False, + }, + } + }, + } + + _ensure_cant_perform_action(user_session, create_policy,"createPolicy") + + + # Assign privileges to the new user to create and manage policies + admin_policy_urn = create_user_policy("urn:li:corpuser:user", ["MANAGE_POLICIES"], admin_session) + + + # Verify new user can create and manage policy(create, edit, delete) + # Create a policy + user_policy_urn = _ensure_can_create_user_policy(user_session, create_policy) + + # Edit a policy + edit_policy = { + "query": """mutation updatePolicy($urn: String!, $input: PolicyUpdateInput!) {\n + updatePolicy(urn: $urn, input: $input) }""", + "variables": { + "urn": user_policy_urn, + "input": { + "type": "PLATFORM", + "state": "INACTIVE", + "name": "Policy Name test", + "description": "Policy Description updated", + "privileges": ["MANAGE_POLICIES"], + "actors": { + "users": [], + "groups": None, + "resourceOwners": False, + "allUsers": True, + "allGroups": False, + "resourceOwnersTypes": None, + }, + }, + }, + } + edit_policy_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=edit_policy) + edit_policy_response.raise_for_status() + res_data = edit_policy_response.json() + + assert res_data + assert res_data["data"] + assert res_data["data"]["updatePolicy"] == user_policy_urn + + # Delete a policy + remove_user_policy = { + "query": "mutation deletePolicy($urn: String!) {\n deletePolicy(urn: $urn)\n}\n", + "variables":{"urn":user_policy_urn} + } + + remove_policy_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=remove_user_policy) + remove_policy_response.raise_for_status() + res_data = remove_policy_response.json() + + assert res_data + assert res_data["data"] + assert res_data["data"]["deletePolicy"] == user_policy_urn + + + # Remove the user privilege by admin + remove_policy(admin_policy_urn, admin_session) + + + # Ensure that user can't create a policy after privilege is removed by admin + _ensure_cant_perform_action(user_session, create_policy,"createPolicy") \ No newline at end of file