diff --git a/.github/workflows/airflow-plugin.yml b/.github/workflows/airflow-plugin.yml
index 63bab821cc398b..d0c0f52781b9af 100644
--- a/.github/workflows/airflow-plugin.yml
+++ b/.github/workflows/airflow-plugin.yml
@@ -10,9 +10,9 @@ on:
       - "metadata-models/**"
   pull_request:
     branches:
-      - master
+      - "**"
     paths:
-      - ".github/**"
+      - ".github/workflows/airflow-plugin.yml"
       - "metadata-ingestion-modules/airflow-plugin/**"
       - "metadata-ingestion/**"
       - "metadata-models/**"
@@ -32,16 +32,21 @@ jobs:
     strategy:
       matrix:
         include:
-          - python-version: "3.7"
-            extraPythonRequirement: "apache-airflow~=2.1.0"
-          - python-version: "3.7"
-            extraPythonRequirement: "apache-airflow~=2.2.0"
+          - python-version: "3.8"
+            extra_pip_requirements: "apache-airflow~=2.1.4"
+            extra_pip_extras: plugin-v1
+          - python-version: "3.8"
+            extra_pip_requirements: "apache-airflow~=2.2.4"
+            extra_pip_extras: plugin-v1
           - python-version: "3.10"
-            extraPythonRequirement: "apache-airflow~=2.4.0"
+            extra_pip_requirements: "apache-airflow~=2.4.0"
+            extra_pip_extras: plugin-v2
           - python-version: "3.10"
-            extraPythonRequirement: "apache-airflow~=2.6.0"
+            extra_pip_requirements: "apache-airflow~=2.6.0"
+            extra_pip_extras: plugin-v2
           - python-version: "3.10"
-            extraPythonRequirement: "apache-airflow>2.6.0"
+            extra_pip_requirements: "apache-airflow>=2.7.0"
+            extra_pip_extras: plugin-v2
       fail-fast: false
     steps:
       - uses: actions/checkout@v3
@@ -51,13 +56,13 @@ jobs:
           cache: "pip"
       - name: Install dependencies
         run: ./metadata-ingestion/scripts/install_deps.sh
-      - name: Install airflow package and test (extras ${{ matrix.extraPythonRequirement }})
-        run: ./gradlew -Pextra_pip_requirements='${{ matrix.extraPythonRequirement }}' :metadata-ingestion-modules:airflow-plugin:lint :metadata-ingestion-modules:airflow-plugin:testQuick
+      - name: Install airflow package and test (extras ${{ matrix.extra_pip_requirements }})
+        run: ./gradlew -Pextra_pip_requirements='${{ matrix.extra_pip_requirements }}' -Pextra_pip_extras='${{ matrix.extra_pip_extras }}' :metadata-ingestion-modules:airflow-plugin:lint :metadata-ingestion-modules:airflow-plugin:testQuick
       - name: pip freeze show list installed
         if: always()
         run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && pip freeze
       - uses: actions/upload-artifact@v3
-        if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'apache-airflow>2.6.0' }}
+        if: ${{ always() && matrix.python-version == '3.10' && matrix.extra_pip_requirements == 'apache-airflow>=2.7.0' }}
        with:
          name: Test Results (Airflow Plugin ${{ matrix.python-version}})
          path: |
diff --git a/.github/workflows/build-and-test.yml b/.github/workflows/build-and-test.yml
index f6320e1bd5c9fc..25f3957e8f0861 100644
--- a/.github/workflows/build-and-test.yml
+++ b/.github/workflows/build-and-test.yml
@@ -8,7 +8,7 @@ on:
       - "**.md"
   pull_request:
     branches:
-      - master
+      - "**"
     paths-ignore:
       - "docs/**"
       - "**.md"
@@ -24,17 +24,12 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        command:
-          [
-            "./gradlew build -x :metadata-ingestion:build -x :metadata-ingestion:check -x docs-website:build -x :metadata-integration:java:spark-lineage:test -x :metadata-io:test -x :metadata-ingestion-modules:airflow-plugin:build -x :datahub-frontend:build -x :datahub-web-react:build --parallel",
-            "./gradlew :datahub-frontend:build :datahub-web-react:build --parallel",
-            "./gradlew :metadata-ingestion-modules:airflow-plugin:build --parallel"
-          ]
-        timezone:
-          [
-            "UTC",
-            "America/New_York",
-          ]
+        command: [
+            # metadata-ingestion and airflow-plugin each have dedicated build jobs
+            "except_metadata_ingestion",
+            "frontend"
+          ]
+        timezone: ["UTC", "America/New_York"]
     runs-on: ubuntu-latest
     timeout-minutes: 60
     steps:
@@ -51,10 +46,17 @@ jobs:
           java-version: 11
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
-      - name: Gradle build (and test)
+          python-version: "3.10"
+          cache: pip
+      - name: Gradle build (and test) for metadata ingestion
+        # we only need the timezone runs for frontend tests
+        if: ${{ matrix.command == 'except_metadata_ingestion' && matrix.timezone == 'America/New_York' }}
+        run: |
+          ./gradlew build -x :metadata-ingestion:build -x :metadata-ingestion:check -x docs-website:build -x :metadata-integration:java:spark-lineage:test -x :metadata-io:test -x :metadata-ingestion-modules:airflow-plugin:build -x :metadata-ingestion-modules:airflow-plugin:check -x :datahub-frontend:build -x :datahub-web-react:build --parallel
+      - name: Gradle build (and test) for frontend
+        if: ${{ matrix.command == 'frontend' }}
         run: |
-          ${{ matrix.command }}
+          ./gradlew :datahub-frontend:build :datahub-web-react:build --parallel
         env:
           NODE_OPTIONS: "--max-old-space-size=3072"
       - uses: actions/upload-artifact@v3
@@ -81,7 +83,7 @@ jobs:
       - uses: actions/checkout@v3
       - uses: actions/setup-python@v4
         with:
-          python-version: "3.7"
+          python-version: "3.10"
       - name: Download YQ
         uses: chrisdickinson/setup-yq@v1.0.1
         with:
diff --git a/.github/workflows/check-datahub-jars.yml b/.github/workflows/check-datahub-jars.yml
index 841a9ed5f9bc73..41f9ea91a94e20 100644
--- a/.github/workflows/check-datahub-jars.yml
+++ b/.github/workflows/check-datahub-jars.yml
@@ -4,17 +4,13 @@ on:
   push:
     branches:
       - master
-    paths-ignore:
-      - "docker/**"
-      - "docs/**"
-      - "**.md"
+    paths:
+      - "metadata-integration"
   pull_request:
     branches:
-      - master
-    paths-ignore:
-      - "docker/**"
-      - "docs/**"
-      - "**.md"
+      - "**"
+    paths:
+      - "metadata-integration"
   release:
     types: [published]
@@ -28,12 +24,7 @@ jobs:
       max-parallel: 1
       fail-fast: false
       matrix:
-        command:
-          [
-            "datahub-client",
-            "datahub-protobuf",
-            "spark-lineage"
-          ]
+        command: ["datahub-client", "datahub-protobuf", "spark-lineage"]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
diff --git a/.github/workflows/close-stale-issues.yml b/.github/workflows/close-stale-issues.yml
index a7809087702acb..98e3041f288040 100644
--- a/.github/workflows/close-stale-issues.yml
+++ b/.github/workflows/close-stale-issues.yml
@@ -18,7 +18,9 @@ jobs:
           days-before-issue-stale: 30
           days-before-issue-close: 30
           stale-issue-label: "stale"
-          stale-issue-message: "This issue is stale because it has been open for 30 days with no activity. If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://slack.datahubproject.io. For feature requests please use https://feature-requests.datahubproject.io"
+          stale-issue-message:
+            "This issue is stale because it has been open for 30 days with no activity. If you believe this is still an issue on the latest DataHub release please leave a comment with the version that you tested it with. If this is a question/discussion please head to https://slack.datahubproject.io.\
+            \ For feature requests please use https://feature-requests.datahubproject.io"
           close-issue-message: "This issue was closed because it has been inactive for 30 days since being marked as stale."
           days-before-pr-stale: -1
           days-before-pr-close: -1
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index 6ce19a5b4616ec..e12971b8a62084 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -10,7 +10,7 @@ on:
       - ".github/workflows/code-checks.yml"
   pull_request:
     branches:
-      - master
+      - "**"
     paths:
       - "metadata-io/**"
       - "datahub-web-react/**"
@@ -21,17 +21,12 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
   cancel-in-progress: true

-
 jobs:
   code_check:
     strategy:
       fail-fast: false
       matrix:
-        command:
-          [
-            "check_event_type.py",
-            "check_policies.py"
-          ]
+        command: ["check_event_type.py", "check_policies.py"]
     name: run code checks
     runs-on: ubuntu-latest
     steps:
@@ -43,5 +38,5 @@ jobs:
         with:
           python-version: "3.10"
       - name: run check ${{ matrix.command }}
-        run: |
-          python .github/scripts/${{ matrix.command }}
\ No newline at end of file
+        run: |-
+          python .github/scripts/${{ matrix.command }}
diff --git a/.github/workflows/docker-postgres-setup.yml b/.github/workflows/docker-postgres-setup.yml
index a5d421d4b7ff56..fda4349f90bf7c 100644
--- a/.github/workflows/docker-postgres-setup.yml
+++ b/.github/workflows/docker-postgres-setup.yml
@@ -8,7 +8,7 @@ on:
       - ".github/workflows/docker-postgres-setup.yml"
   pull_request:
     branches:
-      - master
+      - "**"
     paths:
       - "docker/postgres-setup/**"
       - ".github/workflows/docker-postgres-setup.yml"
@@ -61,4 +61,3 @@ jobs:
           context: .
           file: ./docker/postgres-setup/Dockerfile
           platforms: linux/amd64,linux/arm64
-
diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml
index 2aae6bf51529db..5f5a62de6288c8 100644
--- a/.github/workflows/docker-unified.yml
+++ b/.github/workflows/docker-unified.yml
@@ -8,7 +8,7 @@ on:
       - "**.md"
   pull_request:
     branches:
-      - master
+      - "**"
     paths-ignore:
       - "docs/**"
       - "**.md"
@@ -545,7 +545,6 @@ jobs:
         id: tag
         run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT
-
   datahub_ingestion_slim_build:
     name: Build and Push DataHub Ingestion Docker Images
     runs-on: ubuntu-latest
@@ -809,8 +808,8 @@ jobs:
           DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }}
           DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_IMAGE }}
           ACTIONS_VERSION: ${{ needs.datahub_ingestion_slim_build.outputs.tag }}
-          ACTIONS_EXTRA_PACKAGES: 'acryl-datahub-actions[executor]==0.0.13 acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5'
-          ACTIONS_CONFIG: 'https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml'
+          ACTIONS_EXTRA_PACKAGES: "acryl-datahub-actions[executor]==0.0.13 acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5"
+          ACTIONS_CONFIG: "https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml"
         run: |
           ./smoke-test/run-quickstart.sh
       - name: sleep 60s
@@ -852,8 +851,14 @@
         if: failure()
         run: |
           docker ps -a
-          docker logs datahub-gms >& gms-${{ matrix.test_strategy }}.log
-          docker logs datahub-actions >& actions-${{ matrix.test_strategy }}.log
+          docker logs datahub-gms >& gms-${{ matrix.test_strategy }}.log || true
+          docker logs datahub-actions >& actions-${{ matrix.test_strategy }}.log || true
+          docker logs datahub-mae-consumer >& mae-${{ matrix.test_strategy }}.log || true
+          docker logs datahub-mce-consumer >& mce-${{ matrix.test_strategy }}.log || true
+          docker logs broker >& broker-${{ matrix.test_strategy }}.log || true
+          docker logs mysql >& mysql-${{ matrix.test_strategy }}.log || true
+          docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true
+          docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true
       - name: Upload logs
         uses: actions/upload-artifact@v3
         if: failure()
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
index 68432a4feb13dd..c94282938120e4 100644
--- a/.github/workflows/documentation.yml
+++ b/.github/workflows/documentation.yml
@@ -3,10 +3,18 @@ name: documentation
 on:
   pull_request:
     branches:
-      - master
+      - "**"
+    paths:
+      - "metadata-ingestion/**"
+      - "metadata-models/**"
+      - "docs-website/**"
   push:
     branches:
       - master
+    paths:
+      - "metadata-ingestion/**"
+      - "metadata-models/**"
+      - "docs-website/**"
   # release:
   #   types: [published, edited]
diff --git a/.github/workflows/lint-actions.yml b/.github/workflows/lint-actions.yml
index b285e46da48575..6f34bf292bf51a 100644
--- a/.github/workflows/lint-actions.yml
+++ b/.github/workflows/lint-actions.yml
@@ -2,8 +2,10 @@ name: Lint actions
 on:
   pull_request:
     paths:
-      - '.github/workflows/**'
+      - ".github/workflows/**"
+    branches:
+      - "**"

 jobs:
   actionlint:
     runs-on: ubuntu-latest
diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml
index 8d56a0adf5bd57..ec6bd4141cc6fc 100644
--- a/.github/workflows/metadata-ingestion.yml
+++ b/.github/workflows/metadata-ingestion.yml
@@ -9,9 +9,9 @@ on:
       - "metadata-models/**"
   pull_request:
     branches:
-      - master
+      - "**"
     paths:
-      - ".github/**"
+      - ".github/workflows/metadata-ingestion.yml"
       - "metadata-ingestion/**"
       - "metadata-models/**"
   release:
@@ -34,7 +34,6 @@ jobs:
         python-version: ["3.7", "3.10"]
         command:
           [
-            "lint",
             "testQuick",
             "testIntegrationBatch0",
             "testIntegrationBatch1",
@@ -54,6 +53,9 @@ jobs:
         run: ./metadata-ingestion/scripts/install_deps.sh
       - name: Install package
         run: ./gradlew :metadata-ingestion:installPackageOnly
+      - name: Run lint along with testQuick
+        if: ${{ matrix.command == 'testQuick' }}
+        run: ./gradlew :metadata-ingestion:lint
       - name: Run metadata-ingestion tests
         run: ./gradlew :metadata-ingestion:${{ matrix.command }}
       - name: Debug info
@@ -65,7 +67,6 @@ jobs:
           docker image ls
           docker system df
       - uses: actions/upload-artifact@v3
-        if: ${{ always() && matrix.command != 'lint' }}
         with:
           name: Test Results (metadata ingestion ${{ matrix.python-version }})
           path: |
@@ -73,7 +74,7 @@ jobs:
             **/build/test-results/test/**
             **/junit.*.xml
       - name: Upload coverage to Codecov
-        if: ${{ always() && matrix.python-version == '3.10' && matrix.command != 'lint' }}
+        if: ${{ always() && matrix.python-version == '3.10' }}
         uses: codecov/codecov-action@v3
         with:
           token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.github/workflows/metadata-io.yml b/.github/workflows/metadata-io.yml
index e37ddd0ce4e86f..48f230ce14c8db 100644
--- a/.github/workflows/metadata-io.yml
+++ b/.github/workflows/metadata-io.yml
@@ -10,7 +10,7 @@ on:
       - "metadata-io/**"
   pull_request:
     branches:
-      - master
+      - "**"
     paths:
       - "**/*.gradle"
       - "li-utils/**"
diff --git a/.github/workflows/metadata-model.yml b/.github/workflows/metadata-model.yml
index 9d54c88eee591f..4bae5ccc9a266d 100644
--- a/.github/workflows/metadata-model.yml
+++ b/.github/workflows/metadata-model.yml
@@ -3,9 +3,8 @@ on:
   push:
     branches:
       - master
-    paths-ignore:
-      - "docs/**"
-      - "**.md"
+    paths:
+      - "metadata-models/**"
   release:
     types: [published]
diff --git a/.github/workflows/spark-smoke-test.yml b/.github/workflows/spark-smoke-test.yml
index b2482602e75480..541b2019b93ef1 100644
--- a/.github/workflows/spark-smoke-test.yml
+++ b/.github/workflows/spark-smoke-test.yml
@@ -12,7 +12,7 @@ on:
       - ".github/workflows/spark-smoke-test.yml"
   pull_request:
     branches:
-      - master
+      - "**"
     paths:
       - "metadata_models/**"
       - "metadata-integration/java/datahub-client/**"
diff --git a/build.gradle b/build.gradle
index 025c588da2b523..bd282535fa13cd 100644
--- a/build.gradle
+++ b/build.gradle
@@ -27,7 +27,7 @@ buildscript {
   dependencies {
     classpath 'com.linkedin.pegasus:gradle-plugins:' + pegasusVersion
     classpath 'com.github.node-gradle:gradle-node-plugin:2.2.4'
-    classpath 'io.acryl.gradle.plugin:gradle-avro-plugin:0.8.1'
+    classpath 'io.acryl.gradle.plugin:gradle-avro-plugin:0.2.0'
     classpath 'org.springframework.boot:spring-boot-gradle-plugin:' + springBootVersion
     classpath "io.codearte.gradle.nexus:gradle-nexus-staging-plugin:0.30.0"
     classpath "com.palantir.gradle.gitversion:gradle-git-version:3.0.0"
@@ -39,7 +39,7 @@ buildscript {
 plugins {
   id 'com.gorylenko.gradle-git-properties' version '2.4.0-rc2'
   id 'com.github.johnrengelman.shadow' version '6.1.0'
-  id 'com.palantir.docker' version '0.35.0'
+  id 'com.palantir.docker' version '0.35.0' apply false
   // https://blog.ltgt.net/javax-jakarta-mess-and-gradle-solution/
   // TODO id "org.gradlex.java-ecosystem-capabilities" version "1.0"
 }
@@ -67,8 +67,8 @@ project.ext.externalDependency = [
   'antlr4Runtime': 'org.antlr:antlr4-runtime:4.7.2',
   'antlr4': 'org.antlr:antlr4:4.7.2',
   'assertJ': 'org.assertj:assertj-core:3.11.1',
-  'avro_1_7': 'org.apache.avro:avro:1.7.7',
-  'avroCompiler_1_7': 'org.apache.avro:avro-compiler:1.7.7',
+  'avro': 'org.apache.avro:avro:1.11.3',
+  'avroCompiler': 'org.apache.avro:avro-compiler:1.11.3',
   'awsGlueSchemaRegistrySerde': 'software.amazon.glue:schema-registry-serde:1.1.10',
   'awsMskIamAuth': 'software.amazon.msk:aws-msk-iam-auth:1.1.1',
   'awsSecretsManagerJdbc': 'com.amazonaws.secretsmanager:aws-secretsmanager-jdbc:1.0.8',
@@ -127,7 +127,6 @@ project.ext.externalDependency = [
   'jgrapht': 'org.jgrapht:jgrapht-core:1.5.1',
   'jna': 'net.java.dev.jna:jna:5.12.1',
   'jsonPatch': 'com.github.java-json-tools:json-patch:1.13',
-  'jsonSchemaAvro': 'com.github.fge:json-schema-avro:0.1.4',
   'jsonSimple': 'com.googlecode.json-simple:json-simple:1.1.1',
   'jsonSmart': 'net.minidev:json-smart:2.4.9',
   'json': 'org.json:json:20230227',
diff --git a/buildSrc/build.gradle b/buildSrc/build.gradle
index 65b3780431db9d..1f9d30d520171b 100644
--- a/buildSrc/build.gradle
+++ b/buildSrc/build.gradle
@@ -5,7 +5,14 @@ buildscript {
 }

 dependencies {
-  implementation('io.acryl:json-schema-avro:0.1.5') {
+  /**
+   * Forked version of the abandoned repository https://github.com/fge/json-schema-avro.
+   * The maintainer was last active in 2014; we maintain an active fork of this repository to map Avro schemas to JSON Schemas,
+   * and the repository is as close to an official library for this as you can get. The original maintainer is one of the authors of the JSON Schema spec.
+   * Other companies are also separately maintaining forks (like: https://github.com/java-json-tools/json-schema-avro).
+   * We have built several customizations on top of it for various bug fixes, especially around union schemas.
+   */
+  implementation('io.acryl:json-schema-avro:0.2.2') {
     exclude group: 'com.fasterxml.jackson.core', module: 'jackson-databind'
     exclude group: 'com.google.guava', module: 'guava'
   }
diff --git a/datahub-frontend/app/auth/AuthModule.java b/datahub-frontend/app/auth/AuthModule.java
index 98f3b82285edaf..fe04c3629fe582 100644
--- a/datahub-frontend/app/auth/AuthModule.java
+++ b/datahub-frontend/app/auth/AuthModule.java
@@ -56,7 +56,7 @@ public class AuthModule extends AbstractModule {
    * Pac4j Stores Session State in a browser-side cookie in encrypted fashion. This configuration
    * value provides a stable encryption base from which to derive the encryption key.
    *
-   * We hash this value (SHA1), then take the first 16 bytes as the AES key.
+   * We hash this value (SHA256), then take the first 16 bytes as the AES key.
    */
   private static final String PAC4J_AES_KEY_BASE_CONF = "play.http.secret.key";
   private static final String PAC4J_SESSIONSTORE_PROVIDER_CONF = "pac4j.sessionStore.provider";
@@ -93,7 +93,7 @@ protected void configure() {
     // it to hex and slice the first 16 bytes, because AES key length must strictly
     // have a specific length.
     final String aesKeyBase = _configs.getString(PAC4J_AES_KEY_BASE_CONF);
-    final String aesKeyHash = DigestUtils.sha1Hex(aesKeyBase.getBytes(StandardCharsets.UTF_8));
+    final String aesKeyHash = DigestUtils.sha256Hex(aesKeyBase.getBytes(StandardCharsets.UTF_8));
     final String aesEncryptionKey = aesKeyHash.substring(0, 16);
     playCacheCookieStore = new PlayCookieSessionStore(
         new ShiroAesDataEncrypter(aesEncryptionKey.getBytes()));
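Editor's note: the SHA1-to-SHA256 change above keeps the same key-shaping trick — hash the Play secret, hex-encode, and use the first 16 hex characters (16 ASCII bytes) as an AES-128 key. A minimal standalone sketch of that derivation, assuming only commons-codec on the classpath; the secret value is an example:

```java
import java.nio.charset.StandardCharsets;
import javax.crypto.spec.SecretKeySpec;
import org.apache.commons.codec.digest.DigestUtils;

public class AesKeyDerivationSketch {
    public static void main(String[] args) {
        // Example stand-in for the value configured under play.http.secret.key.
        String secret = "example-play-http-secret";
        // SHA-256 of the secret, rendered as a 64-character hex string.
        String hash = DigestUtils.sha256Hex(secret.getBytes(StandardCharsets.UTF_8));
        // First 16 hex characters -> 16 ASCII bytes -> a 128-bit AES key.
        byte[] keyBytes = hash.substring(0, 16).getBytes(StandardCharsets.UTF_8);
        SecretKeySpec aesKey = new SecretKeySpec(keyBytes, "AES");
        System.out.println("Key length in bits: " + aesKey.getEncoded().length * 8);
    }
}
```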
diff --git a/datahub-frontend/app/auth/AuthUtils.java b/datahub-frontend/app/auth/AuthUtils.java
index 80bd631d0db703..386eee725c83d0 100644
--- a/datahub-frontend/app/auth/AuthUtils.java
+++ b/datahub-frontend/app/auth/AuthUtils.java
@@ -41,6 +41,11 @@ public class AuthUtils {
    */
   public static final String SYSTEM_CLIENT_SECRET_CONFIG_PATH = "systemClientSecret";

+  /**
+   * Cookie name for redirect url that is manually separated from the session to reduce size
+   */
+  public static final String REDIRECT_URL_COOKIE_NAME = "REDIRECT_URL";
+
   public static final CorpuserUrn DEFAULT_ACTOR_URN = new CorpuserUrn("datahub");

   public static final String LOGIN_ROUTE = "/login";
@@ -77,7 +82,9 @@ public static boolean isEligibleForForwarding(Http.Request req) {
    * as well as their agreement to determine authentication status.
    */
   public static boolean hasValidSessionCookie(final Http.Request req) {
-    return req.session().data().containsKey(ACTOR)
+    Map sessionCookie = req.session().data();
+    return sessionCookie.containsKey(ACCESS_TOKEN)
+        && sessionCookie.containsKey(ACTOR)
         && req.getCookie(ACTOR).isPresent()
         && req.session().data().get(ACTOR).equals(req.getCookie(ACTOR).get().value());
   }
diff --git a/datahub-frontend/app/auth/cookie/CustomCookiesModule.java b/datahub-frontend/app/auth/cookie/CustomCookiesModule.java
new file mode 100644
index 00000000000000..a6dbd69a938893
--- /dev/null
+++ b/datahub-frontend/app/auth/cookie/CustomCookiesModule.java
@@ -0,0 +1,22 @@
+package auth.cookie;
+
+import com.google.inject.AbstractModule;
+import play.api.libs.crypto.CookieSigner;
+import play.api.libs.crypto.CookieSignerProvider;
+import play.api.mvc.DefaultFlashCookieBaker;
+import play.api.mvc.FlashCookieBaker;
+import play.api.mvc.SessionCookieBaker;
+
+
+public class CustomCookiesModule extends AbstractModule {
+
+  @Override
+  public void configure() {
+    bind(CookieSigner.class).toProvider(CookieSignerProvider.class);
+    // We override the session cookie baker to not use a fallback, this prevents using an old URL Encoded cookie
+    bind(SessionCookieBaker.class).to(CustomSessionCookieBaker.class);
+    // We don't care about flash cookies, we don't use them
+    bind(FlashCookieBaker.class).to(DefaultFlashCookieBaker.class);
+  }
+
+}
diff --git a/datahub-frontend/app/auth/cookie/CustomSessionCookieBaker.scala b/datahub-frontend/app/auth/cookie/CustomSessionCookieBaker.scala
new file mode 100644
index 00000000000000..6f0a6604fa64bf
--- /dev/null
+++ b/datahub-frontend/app/auth/cookie/CustomSessionCookieBaker.scala
@@ -0,0 +1,25 @@
+package auth.cookie
+
+import com.google.inject.Inject
+import play.api.http.{SecretConfiguration, SessionConfiguration}
+import play.api.libs.crypto.CookieSigner
+import play.api.mvc.DefaultSessionCookieBaker
+
+import scala.collection.immutable.Map
+
+/**
+  * Overrides default fallback to URL Encoding behavior, prevents usage of old URL encoded session cookies
+  * @param config
+  * @param secretConfiguration
+  * @param cookieSigner
+  */
+class CustomSessionCookieBaker @Inject() (
+    override val config: SessionConfiguration,
+    override val secretConfiguration: SecretConfiguration,
+    cookieSigner: CookieSigner
+) extends DefaultSessionCookieBaker(config, secretConfiguration, cookieSigner) {
+  // Has to be a Scala class because it extends a trait with concrete implementations, Scala does compilation tricks
+
+  // Forces use of jwt encoding and disallows fallback to legacy url encoding
+  override def decode(encodedData: String): Map[String, String] = jwtCodec.decode(encodedData)
+}
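Editor's note: with the legacy URL-encoding fallback disabled, the PLAY_SESSION cookie becomes a signed JWT whose session entries sit under a "data" claim — the same shape the updated ApplicationTest later in this diff asserts on. A hedged sketch of inspecting such a cookie with Nimbus JOSE; the cookie value is assumed to be captured elsewhere:

```java
import java.text.ParseException;
import java.util.Map;
import com.nimbusds.jwt.JWT;
import com.nimbusds.jwt.JWTClaimsSet;
import com.nimbusds.jwt.JWTParser;

public class SessionCookieSketch {
    public static void main(String[] args) throws ParseException {
        // Hypothetical PLAY_SESSION cookie value captured from a browser, passed as an argument.
        String playSessionCookie = args[0];
        JWT jwt = JWTParser.parse(playSessionCookie);
        JWTClaimsSet claims = jwt.getJWTClaimsSet();
        // Play stores the session map under the "data" claim.
        Map<String, Object> data = (Map<String, Object>) claims.getClaim("data");
        System.out.println("actor = " + data.get("actor"));
        System.out.println("token present = " + data.containsKey("token"));
        // play.http.session.maxAge (configured below in application.conf) drives this timestamp.
        System.out.println("expires = " + claims.getExpirationTime());
    }
}
```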
diff --git a/datahub-frontend/app/auth/sso/oidc/OidcAuthorizationGenerator.java b/datahub-frontend/app/auth/sso/oidc/OidcAuthorizationGenerator.java
index 3f864ed5abddfa..baca144610ec4c 100644
--- a/datahub-frontend/app/auth/sso/oidc/OidcAuthorizationGenerator.java
+++ b/datahub-frontend/app/auth/sso/oidc/OidcAuthorizationGenerator.java
@@ -1,19 +1,9 @@
 package auth.sso.oidc;

-import java.text.ParseException;
 import java.util.Map.Entry;
 import java.util.Optional;

-import com.nimbusds.jose.Algorithm;
-import com.nimbusds.jose.Header;
-import com.nimbusds.jose.JWEAlgorithm;
-import com.nimbusds.jose.JWSAlgorithm;
-import com.nimbusds.jose.util.Base64URL;
-import com.nimbusds.jose.util.JSONObjectUtils;
-import com.nimbusds.jwt.EncryptedJWT;
 import com.nimbusds.jwt.JWTParser;
-import com.nimbusds.jwt.SignedJWT;
-import net.minidev.json.JSONObject;

 import org.pac4j.core.authorization.generator.AuthorizationGenerator;
 import org.pac4j.core.context.WebContext;
 import org.pac4j.core.profile.AttributeLocation;
@@ -63,32 +53,5 @@ public Optional generate(WebContext context, UserProfile profile) {

     return Optional.ofNullable(profile);
   }
-
-  private static JWT parse(final String s) throws ParseException {
-    final int firstDotPos = s.indexOf(".");
-
-    if (firstDotPos == -1) {
-      throw new ParseException("Invalid JWT serialization: Missing dot delimiter(s)", 0);
-    }
-
-    Base64URL header = new Base64URL(s.substring(0, firstDotPos));
-    JSONObject jsonObject;
-
-    try {
-      jsonObject = JSONObjectUtils.parse(header.decodeToString());
-    } catch (ParseException e) {
-      throw new ParseException("Invalid unsecured/JWS/JWE header: " + e.getMessage(), 0);
-    }
-
-    Algorithm alg = Header.parseAlgorithm(jsonObject);
-
-    if (alg instanceof JWSAlgorithm) {
-      return SignedJWT.parse(s);
-    } else if (alg instanceof JWEAlgorithm) {
-      return EncryptedJWT.parse(s);
-    } else {
-      throw new AssertionError("Unexpected algorithm type: " + alg);
-    }
-  }
 }
diff --git a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java
index 4bde0872fc082c..7164710f4e0ded 100644
--- a/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java
+++ b/datahub-frontend/app/auth/sso/oidc/OidcCallbackLogic.java
@@ -38,6 +38,7 @@
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Base64;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.List;
@@ -49,19 +50,21 @@
 import java.util.stream.Collectors;
 import lombok.extern.slf4j.Slf4j;
 import org.pac4j.core.config.Config;
+import org.pac4j.core.context.Cookie;
 import org.pac4j.core.engine.DefaultCallbackLogic;
 import org.pac4j.core.http.adapter.HttpActionAdapter;
 import org.pac4j.core.profile.CommonProfile;
 import org.pac4j.core.profile.ProfileManager;
 import org.pac4j.core.profile.UserProfile;
+import org.pac4j.core.util.Pac4jConstants;
 import org.pac4j.play.PlayWebContext;
 import play.mvc.Result;
 import auth.sso.SsoManager;

-import static auth.AuthUtils.createActorCookie;
-import static auth.AuthUtils.createSessionMap;
+import static auth.AuthUtils.*;
 import static com.linkedin.metadata.Constants.CORP_USER_ENTITY_NAME;
 import static com.linkedin.metadata.Constants.GROUP_MEMBERSHIP_ASPECT_NAME;
+import static org.pac4j.play.store.PlayCookieSessionStore.*;
 import static play.mvc.Results.internalServerError;

@@ -97,6 +100,9 @@ public OidcCallbackLogic(final SsoManager ssoManager, final Authentication syste
   public Result perform(PlayWebContext context, Config config,
       HttpActionAdapter httpActionAdapter, String defaultUrl, Boolean saveInSession,
       Boolean multiProfile, Boolean renewSession, String defaultClient) {
+
+    setContextRedirectUrl(context);
+
     final Result result =
         super.perform(context, config, httpActionAdapter, defaultUrl, saveInSession, multiProfile,
             renewSession, defaultClient);
@@ -111,6 +117,15 @@ public Result perform(PlayWebContext context, Config config,
     return handleOidcCallback(oidcConfigs, result, context, getProfileManager(context));
   }

+  @SuppressWarnings("unchecked")
+  private void setContextRedirectUrl(PlayWebContext context) {
+    Optional<Cookie> redirectUrl = context.getRequestCookies().stream()
+        .filter(cookie -> REDIRECT_URL_COOKIE_NAME.equals(cookie.getName())).findFirst();
+    redirectUrl.ifPresent(
+        cookie -> context.getSessionStore().set(context, Pac4jConstants.REQUESTED_URL,
+            JAVA_SER_HELPER.deserializeFromBytes(uncompressBytes(Base64.getDecoder().decode(cookie.getValue())))));
+  }
+
   private Result handleOidcCallback(final OidcConfigs oidcConfigs, final Result result,
       final PlayWebContext context, final ProfileManager profileManager) {
diff --git a/datahub-frontend/app/client/KafkaTrackingProducer.java b/datahub-frontend/app/client/KafkaTrackingProducer.java
index fab17f9215d4a2..59e91a6d5a0f7f 100644
--- a/datahub-frontend/app/client/KafkaTrackingProducer.java
+++ b/datahub-frontend/app/client/KafkaTrackingProducer.java
@@ -1,6 +1,8 @@
 package client;

+import com.linkedin.metadata.config.kafka.ProducerConfiguration;
 import com.typesafe.config.Config;
+import config.ConfigurationProvider;
 import org.apache.kafka.clients.CommonClientConfigs;
 import org.apache.kafka.clients.producer.KafkaProducer;
 import org.apache.kafka.clients.producer.ProducerConfig;
@@ -35,12 +37,12 @@ public class KafkaTrackingProducer {
   private final KafkaProducer _producer;

   @Inject
-  public KafkaTrackingProducer(@Nonnull Config config, ApplicationLifecycle lifecycle) {
+  public KafkaTrackingProducer(@Nonnull Config config, ApplicationLifecycle lifecycle, final ConfigurationProvider configurationProvider) {
     _isEnabled = !config.hasPath("analytics.enabled") || config.getBoolean("analytics.enabled");

     if (_isEnabled) {
       _logger.debug("Analytics tracking is enabled");
-      _producer = createKafkaProducer(config);
+      _producer = createKafkaProducer(config, configurationProvider.getKafka().getProducer());

       lifecycle.addStopHook(
           () -> {
@@ -62,13 +64,15 @@ public void send(ProducerRecord record) {
     _producer.send(record);
   }

-  private static KafkaProducer createKafkaProducer(Config config) {
+  private static KafkaProducer createKafkaProducer(Config config, ProducerConfiguration producerConfiguration) {
     final Properties props = new Properties();
     props.put(ProducerConfig.CLIENT_ID_CONFIG, "datahub-frontend");
     props.put(ProducerConfig.DELIVERY_TIMEOUT_MS_CONFIG, config.getString("analytics.kafka.delivery.timeout.ms"));
     props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, config.getString("analytics.kafka.bootstrap.server"));
     props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); // Actor urn.
     props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, "org.apache.kafka.common.serialization.StringSerializer"); // JSON object.
+    props.put(ProducerConfig.MAX_REQUEST_SIZE_CONFIG, producerConfiguration.getMaxRequestSize());
+    props.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, producerConfiguration.getCompressionType());

     final String securityProtocolConfig = "analytics.kafka.security.protocol";
     if (config.hasPath(securityProtocolConfig)
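Editor's note: the two producer properties added above (max.request.size, compression.type) now come from the shared ProducerConfiguration. A hedged sketch of the resulting producer setup in isolation — the broker address, 5 MB limit, and snappy compression are illustrative stand-ins for the configured values, not the shipped defaults:

```java
import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.common.serialization.StringSerializer;

public class TrackingProducerSketch {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092"); // example broker
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class.getName());
        // Stand-ins for ProducerConfiguration.getMaxRequestSize() and getCompressionType().
        props.put(ProducerConfig.MAX_REQUEST_SIZE_CONFIG, 5 * 1024 * 1024);
        props.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, "snappy");
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            // analytics events would be sent here
        }
    }
}
```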
diff --git a/datahub-frontend/app/config/ConfigurationProvider.java b/datahub-frontend/app/config/ConfigurationProvider.java
index 00a5472ec34763..8f526c831b5c9b 100644
--- a/datahub-frontend/app/config/ConfigurationProvider.java
+++ b/datahub-frontend/app/config/ConfigurationProvider.java
@@ -1,6 +1,7 @@
 package config;

 import com.linkedin.metadata.config.cache.CacheConfiguration;
+import com.linkedin.metadata.config.kafka.KafkaConfiguration;
 import com.linkedin.metadata.spring.YamlPropertySourceFactory;
 import lombok.Data;

@@ -11,7 +12,6 @@

 /**
  * Minimal sharing between metadata-service and frontend
- * Initially for use of client caching configuration.
  * Does not use the factories module to avoid transitive dependencies.
  */
 @EnableConfigurationProperties
@@ -19,6 +19,10 @@
 @ConfigurationProperties
 @Data
 public class ConfigurationProvider {
+  /**
+   * Kafka related configs.
+   */
+  private KafkaConfiguration kafka;

   /**
    * Configuration for caching
diff --git a/datahub-frontend/app/controllers/AuthenticationController.java b/datahub-frontend/app/controllers/AuthenticationController.java
index e9ddfb2611ceba..4f89f4f67e1499 100644
--- a/datahub-frontend/app/controllers/AuthenticationController.java
+++ b/datahub-frontend/app/controllers/AuthenticationController.java
@@ -13,14 +13,15 @@
 import com.typesafe.config.Config;
 import java.net.URLEncoder;
 import java.nio.charset.StandardCharsets;
+import java.util.Base64;
 import java.util.Optional;
 import javax.annotation.Nonnull;
 import javax.inject.Inject;
 import org.apache.commons.lang3.StringUtils;
 import org.pac4j.core.client.Client;
+import org.pac4j.core.context.Cookie;
 import org.pac4j.core.exception.http.FoundAction;
 import org.pac4j.core.exception.http.RedirectionAction;
-import org.pac4j.core.util.Pac4jConstants;
 import org.pac4j.play.PlayWebContext;
 import org.pac4j.play.http.PlayHttpActionAdapter;
 import org.pac4j.play.store.PlaySessionStore;
@@ -33,18 +34,9 @@
 import play.mvc.Results;
 import security.AuthenticationManager;

-import static auth.AuthUtils.DEFAULT_ACTOR_URN;
-import static auth.AuthUtils.EMAIL;
-import static auth.AuthUtils.FULL_NAME;
-import static auth.AuthUtils.INVITE_TOKEN;
-import static auth.AuthUtils.LOGIN_ROUTE;
-import static auth.AuthUtils.PASSWORD;
-import static auth.AuthUtils.RESET_TOKEN;
-import static auth.AuthUtils.TITLE;
-import static auth.AuthUtils.USER_NAME;
-import static auth.AuthUtils.createActorCookie;
-import static auth.AuthUtils.createSessionMap;
+import static auth.AuthUtils.*;
 import static org.pac4j.core.client.IndirectClient.ATTEMPTED_AUTHENTICATION_SUFFIX;
+import static org.pac4j.play.store.PlayCookieSessionStore.*;

 // TODO add logging.

@@ -297,8 +289,12 @@ private Optional redirectToIdentityProvider(Http.RequestHeader request,
   }

   private void configurePac4jSessionStore(PlayWebContext context, Client client, String redirectPath) {
-    // Set the originally requested path for post-auth redirection.
-    _playSessionStore.set(context, Pac4jConstants.REQUESTED_URL, new FoundAction(redirectPath));
+    // Set the originally requested path for post-auth redirection. We split off into a separate cookie from the session
+    // to reduce size of the session cookie
+    FoundAction foundAction = new FoundAction(redirectPath);
+    byte[] javaSerBytes = JAVA_SER_HELPER.serializeToBytes(foundAction);
+    String serialized = Base64.getEncoder().encodeToString(compressBytes(javaSerBytes));
+    context.addResponseCookie(new Cookie(REDIRECT_URL_COOKIE_NAME, serialized));

     // This is to prevent previous login attempts from being cached.
     // We replicate the logic here, which is buried in the Pac4j client.
     if (_playSessionStore.get(context, client.getName() + ATTEMPTED_AUTHENTICATION_SUFFIX) != null) {
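Editor's note: the REDIRECT_URL cookie is written here by AuthenticationController and read back by OidcCallbackLogic above. A sketch of that round trip on its own, using the same pac4j-play statics (JAVA_SER_HELPER, compressBytes, uncompressBytes) this diff imports; the path is an example:

```java
import java.util.Base64;
import org.pac4j.core.exception.http.FoundAction;
import static org.pac4j.play.store.PlayCookieSessionStore.JAVA_SER_HELPER;
import static org.pac4j.play.store.PlayCookieSessionStore.compressBytes;
import static org.pac4j.play.store.PlayCookieSessionStore.uncompressBytes;

public class RedirectCookieRoundTrip {
    public static void main(String[] args) {
        // Serialize the requested URL the way configurePac4jSessionStore does.
        FoundAction original = new FoundAction("/some/requested/path");
        byte[] javaSerBytes = JAVA_SER_HELPER.serializeToBytes(original);
        String cookieValue = Base64.getEncoder().encodeToString(compressBytes(javaSerBytes));

        // ...the cookie travels to the IdP and back with the browser...

        // Decode it the way setContextRedirectUrl does on the callback.
        byte[] decoded = Base64.getDecoder().decode(cookieValue);
        FoundAction restored = (FoundAction) JAVA_SER_HELPER.deserializeFromBytes(uncompressBytes(decoded));
        System.out.println(restored.getLocation()); // -> /some/requested/path
    }
}
```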
diff --git a/datahub-frontend/conf/application.conf b/datahub-frontend/conf/application.conf
index 18d901d5ee7ddc..1a62c8547e721f 100644
--- a/datahub-frontend/conf/application.conf
+++ b/datahub-frontend/conf/application.conf
@@ -22,11 +22,16 @@ play.application.loader = play.inject.guice.GuiceApplicationLoader
 play.http.parser.maxMemoryBuffer = 10MB
 play.http.parser.maxMemoryBuffer = ${?DATAHUB_PLAY_MEM_BUFFER_SIZE}

-# TODO: Disable legacy URL encoding eventually
+play.modules.disabled += "play.api.mvc.LegacyCookiesModule"
 play.modules.disabled += "play.api.mvc.CookiesModule"
-play.modules.enabled += "play.api.mvc.LegacyCookiesModule"
+play.modules.enabled += "auth.cookie.CustomCookiesModule"
 play.modules.enabled += "auth.AuthModule"

+jwt {
+  # 'alg' https://tools.ietf.org/html/rfc7515#section-4.1.1
+  signatureAlgorithm = "HS256"
+}
+
 # We override the Akka server provider to allow setting the max header count to a higher value
 # This is useful while using proxies like Envoy that result in the frontend server rejecting GMS
 # responses as there's more than the max of 64 allowed headers
@@ -199,10 +204,14 @@ auth.native.enabled = ${?AUTH_NATIVE_ENABLED}
 # auth.native.enabled = false
 # auth.oidc.enabled = false # (or simply omit oidc configurations)

-# Login session expiration time
+# Login session expiration time, controls when the actor cookie is expired on the browser side
 auth.session.ttlInHours = 24
 auth.session.ttlInHours = ${?AUTH_SESSION_TTL_HOURS}

+# Control the length of time a session token is valid
+play.http.session.maxAge = 24h
+play.http.session.maxAge = ${?MAX_SESSION_TOKEN_AGE}
+
 analytics.enabled = true
 analytics.enabled = ${?DATAHUB_ANALYTICS_ENABLED}
diff --git a/datahub-frontend/test/app/ApplicationTest.java b/datahub-frontend/test/app/ApplicationTest.java
index 417fd79e76bbd9..f27fefdb796691 100644
--- a/datahub-frontend/test/app/ApplicationTest.java
+++ b/datahub-frontend/test/app/ApplicationTest.java
@@ -1,6 +1,11 @@
 package app;

+import com.nimbusds.jwt.JWT;
+import com.nimbusds.jwt.JWTClaimsSet;
+import com.nimbusds.jwt.JWTParser;
 import controllers.routes;
+import java.text.ParseException;
+import java.util.Date;
 import no.nav.security.mock.oauth2.MockOAuth2Server;
 import no.nav.security.mock.oauth2.token.DefaultOAuth2TokenCallback;
 import okhttp3.mockwebserver.MockResponse;
@@ -27,8 +32,6 @@

 import java.io.IOException;
 import java.net.InetAddress;
-import java.net.URLEncoder;
-import java.nio.charset.StandardCharsets;
 import java.util.List;
 import java.util.Map;

@@ -149,7 +152,7 @@ public void testOpenIdConfig() {
   }

   @Test
-  public void testHappyPathOidc() throws InterruptedException {
+  public void testHappyPathOidc() throws ParseException {
     browser.goTo("/authenticate");
     assertEquals("", browser.url());

@@ -157,8 +160,23 @@ public void testHappyPathOidc() throws InterruptedException {
     assertEquals(TEST_USER, actorCookie.getValue());

     Cookie sessionCookie = browser.getCookie("PLAY_SESSION");
-    assertTrue(sessionCookie.getValue().contains("token=" + TEST_TOKEN));
-    assertTrue(sessionCookie.getValue().contains("actor=" + URLEncoder.encode(TEST_USER, StandardCharsets.UTF_8)));
+    String jwtStr = sessionCookie.getValue();
+    JWT jwt = JWTParser.parse(jwtStr);
+    JWTClaimsSet claims = jwt.getJWTClaimsSet();
+    Map data = (Map) claims.getClaim("data");
+    assertEquals(TEST_TOKEN, data.get("token"));
+    assertEquals(TEST_USER, data.get("actor"));
+    // Default expiration is 24h, so should always be less than current time + 1 day since it stamps the time before this executes
+    assertTrue(claims.getExpirationTime().compareTo(new Date(System.currentTimeMillis() + (24 * 60 * 60 * 1000))) < 0);
+  }
+
+  @Test
+  public void testAPI() throws ParseException {
+    testHappyPathOidc();
+    int requestCount = _gmsServer.getRequestCount();
+
+    browser.goTo("/api/v2/graphql/");
+    assertEquals(++requestCount, _gmsServer.getRequestCount());
   }

   @Test
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
index 3ba0cc1f747e30..b99f712034fe03 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/GmsGraphQLEngine.java
@@ -821,6 +821,7 @@ private void configureQueryResolvers(final RuntimeWiring.Builder builder) {
         .dataFetcher("glossaryNode", getResolver(glossaryNodeType))
         .dataFetcher("domain", getResolver((domainType)))
         .dataFetcher("dataPlatform", getResolver(dataPlatformType))
+        .dataFetcher("dataPlatformInstance", getResolver(dataPlatformInstanceType))
         .dataFetcher("mlFeatureTable", getResolver(mlFeatureTableType))
         .dataFetcher("mlFeature", getResolver(mlFeatureType))
         .dataFetcher("mlPrimaryKey", getResolver(mlPrimaryKeyType))
@@ -1291,7 +1292,8 @@ private void configureCorpUserResolvers(final RuntimeWiring.Builder builder) {
    */
   private void configureCorpGroupResolvers(final RuntimeWiring.Builder builder) {
     builder.type("CorpGroup", typeWiring -> typeWiring
-        .dataFetcher("relationships", new EntityRelationshipsResultResolver(graphClient)));
+        .dataFetcher("relationships", new EntityRelationshipsResultResolver(graphClient))
+        .dataFetcher("exists", new EntityExistsResolver(entityService)));
     builder.type("CorpGroupInfo", typeWiring -> typeWiring
         .dataFetcher("admins", new LoadableTypeBatchResolver<>(corpUserType,
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java
index 3089b8c8fc2dba..03e63c7fb472fa 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/authorization/AuthorizationUtils.java
@@ -4,7 +4,7 @@
 import com.datahub.plugins.auth.authorization.Authorizer;
 import com.datahub.authorization.ConjunctivePrivilegeGroup;
 import com.datahub.authorization.DisjunctivePrivilegeGroup;
-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntitySpec;
 import com.google.common.collect.ImmutableList;
 import com.linkedin.common.AuditStamp;
 import com.linkedin.common.urn.Urn;
@@ -90,7 +90,7 @@ public static boolean canManageTags(@Nonnull QueryContext context) {
   }

   public static boolean canDeleteEntity(@Nonnull Urn entityUrn, @Nonnull QueryContext context) {
-    return isAuthorized(context, Optional.of(new ResourceSpec(entityUrn.getEntityType(), entityUrn.toString())), PoliciesConfig.DELETE_ENTITY_PRIVILEGE);
+    return isAuthorized(context, Optional.of(new EntitySpec(entityUrn.getEntityType(), entityUrn.toString())), PoliciesConfig.DELETE_ENTITY_PRIVILEGE);
   }

   public static boolean canManageUserCredentials(@Nonnull QueryContext context) {
@@ -173,7 +173,7 @@ public static boolean canDeleteQuery(@Nonnull Urn entityUrn, @Nonnull List

   public static boolean isAuthorized(
       @Nonnull QueryContext context,
-      @Nonnull Optional<ResourceSpec> resourceSpec,
+      @Nonnull Optional<EntitySpec> resourceSpec,
       @Nonnull PoliciesConfig.Privilege privilege) {
     final Authorizer authorizer = context.getAuthorizer();
     final String actor = context.getActorUrn();
@@ -196,7 +196,7 @@ public static boolean isAuthorized(
       @Nonnull String resource,
       @Nonnull DisjunctivePrivilegeGroup privilegeGroup
   ) {
-    final ResourceSpec resourceSpec = new ResourceSpec(resourceType, resource);
+    final EntitySpec resourceSpec = new EntitySpec(resourceType, resource);
     return AuthUtil.isAuthorized(authorizer, actor, Optional.of(resourceSpec), privilegeGroup);
   }
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java
index 23be49c7e7140b..2873866bb34f73 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetStatsSummaryResolver.java
@@ -1,6 +1,6 @@
 package com.linkedin.datahub.graphql.resolvers.dataset;

-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntitySpec;
 import com.google.common.cache.Cache;
 import com.google.common.cache.CacheBuilder;
 import com.linkedin.common.urn.Urn;
@@ -104,7 +104,7 @@ private CorpUser createPartialUser(final Urn userUrn) {

   private boolean isAuthorized(final Urn resourceUrn, final QueryContext context) {
     return AuthorizationUtils.isAuthorized(context,
-        Optional.of(new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString())),
+        Optional.of(new EntitySpec(resourceUrn.getEntityType(), resourceUrn.toString())),
         PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE);
   }
 }
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java
index 20361830ad5a54..e4bec8e896fdf7 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/dataset/DatasetUsageStatsResolver.java
@@ -1,6 +1,6 @@
 package com.linkedin.datahub.graphql.resolvers.dataset;

-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntitySpec;
 import com.linkedin.common.urn.Urn;
 import com.linkedin.common.urn.UrnUtils;
 import com.linkedin.datahub.graphql.QueryContext;
@@ -52,7 +52,7 @@ public CompletableFuture get(DataFetchingEnvironment environme

   private boolean isAuthorized(final Urn resourceUrn, final QueryContext context) {
     return AuthorizationUtils.isAuthorized(context,
-        Optional.of(new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString())),
+        Optional.of(new EntitySpec(resourceUrn.getEntityType(), resourceUrn.toString())),
         PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE);
   }
 }
diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/IngestionResolverUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/IngestionResolverUtils.java
index 7db0b6f826a044..1140c031f1d355 100644
--- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/IngestionResolverUtils.java
+++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/IngestionResolverUtils.java
@@ -5,6 +5,7 @@
 import com.linkedin.datahub.graphql.generated.IngestionConfig;
 import com.linkedin.datahub.graphql.generated.IngestionSchedule;
 import com.linkedin.datahub.graphql.generated.IngestionSource;
+import com.linkedin.datahub.graphql.generated.StringMapEntry;
 import com.linkedin.datahub.graphql.generated.StructuredReport;
 import com.linkedin.datahub.graphql.types.common.mappers.StringMapMapper;
 import com.linkedin.entity.EntityResponse;
@@ -21,6 +22,7 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
+import java.util.stream.Collectors;
 import lombok.extern.slf4j.Slf4j;

@@ -143,6 +145,14 @@ public static IngestionConfig mapIngestionSourceConfig(final DataHubIngestionSou
     result.setVersion(config.getVersion());
     result.setExecutorId(config.getExecutorId());
     result.setDebugMode(config.isDebugMode());
+    if (config.getExtraArgs() != null) {
+      List<StringMapEntry> extraArgs = config.getExtraArgs()
+          .keySet()
+          .stream()
+          .map(key -> new StringMapEntry(key, config.getExtraArgs().get(key)))
+          .collect(Collectors.toList());
+      result.setExtraArgs(extraArgs);
+    }
     return result;
   }
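Editor's note: the extraArgs plumbing converts between the PDL StringMap stored on the ingestion source aspect and the entry-list shape GraphQL exposes. A condensed sketch of both directions, mirroring mapIngestionSourceConfig above and UpsertIngestionSourceResolver below; the helper class itself is hypothetical:

```java
import com.linkedin.data.template.StringMap;
import com.linkedin.datahub.graphql.generated.StringMapEntry;
import com.linkedin.datahub.graphql.generated.StringMapEntryInput;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

// Hypothetical helper collecting the two conversions in one place.
public final class ExtraArgsMapping {
    // Aspect -> GraphQL: StringMap to a list of StringMapEntry, as in mapIngestionSourceConfig.
    public static List<StringMapEntry> toEntries(StringMap extraArgs) {
        return extraArgs.entrySet().stream()
            .map(e -> new StringMapEntry(e.getKey(), e.getValue()))
            .collect(Collectors.toList());
    }

    // GraphQL -> Aspect: input entries back into a StringMap, as in UpsertIngestionSourceResolver.
    public static StringMap toStringMap(List<StringMapEntryInput> entries) {
        Map<String, String> map = entries.stream()
            .collect(Collectors.toMap(StringMapEntryInput::getKey, StringMapEntryInput::getValue));
        return new StringMap(map);
    }
}
```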
"true" : "false"; } + if (ingestionSourceInfo.getConfig().hasExtraArgs()) { + arguments.putAll(ingestionSourceInfo.getConfig().getExtraArgs()); + } arguments.put(DEBUG_MODE_ARG_NAME, debugMode); execInput.setArgs(new StringMap(arguments)); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolver.java index 2ce394ad5ba848..68e334bd976f8e 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolver.java @@ -1,10 +1,12 @@ package com.linkedin.datahub.graphql.resolvers.ingest.source; import com.linkedin.common.urn.Urn; +import com.linkedin.data.template.StringMap; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.exception.AuthorizationException; import com.linkedin.datahub.graphql.exception.DataHubGraphQLErrorCode; import com.linkedin.datahub.graphql.exception.DataHubGraphQLException; +import com.linkedin.datahub.graphql.generated.StringMapEntryInput; import com.linkedin.datahub.graphql.generated.UpdateIngestionSourceConfigInput; import com.linkedin.datahub.graphql.generated.UpdateIngestionSourceInput; import com.linkedin.datahub.graphql.generated.UpdateIngestionSourceScheduleInput; @@ -17,6 +19,8 @@ import com.linkedin.mxe.MetadataChangeProposal; import graphql.schema.DataFetcher; import graphql.schema.DataFetchingEnvironment; +import java.util.Map; +import java.util.stream.Collectors; import lombok.extern.slf4j.Slf4j; import java.net.URISyntaxException; @@ -108,6 +112,12 @@ private DataHubIngestionSourceConfig mapConfig(final UpdateIngestionSourceConfig if (input.getDebugMode() != null) { result.setDebugMode(input.getDebugMode()); } + if (input.getExtraArgs() != null) { + Map extraArgs = input.getExtraArgs() + .stream() + .collect(Collectors.toMap(StringMapEntryInput::getKey, StringMapEntryInput::getValue)); + result.setExtraArgs(new StringMap(extraArgs)); + } return result; } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java index 197ca8640559dd..f13ebf8373e91a 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/load/TimeSeriesAspectResolver.java @@ -1,6 +1,6 @@ package com.linkedin.datahub.graphql.resolvers.load; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.authorization.AuthorizationUtils; import com.linkedin.datahub.graphql.generated.Entity; @@ -79,7 +79,7 @@ public TimeSeriesAspectResolver( private boolean isAuthorized(QueryContext context, String urn) { if (_entityName.equals(Constants.DATASET_ENTITY_NAME) && _aspectName.equals( Constants.DATASET_PROFILE_ASPECT_NAME)) { - return AuthorizationUtils.isAuthorized(context, Optional.of(new ResourceSpec(_entityName, urn)), + return AuthorizationUtils.isAuthorized(context, Optional.of(new EntitySpec(_entityName, urn)), 
PoliciesConfig.VIEW_DATASET_PROFILE_PRIVILEGE); } return true; diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnerResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnerResolver.java index 5ca7007d98e43c..3f2dab0a5ba711 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnerResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnerResolver.java @@ -2,14 +2,11 @@ import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.CorpuserUrn; - import com.linkedin.common.urn.Urn; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.exception.AuthorizationException; import com.linkedin.datahub.graphql.generated.AddOwnerInput; -import com.linkedin.datahub.graphql.generated.OwnerEntityType; import com.linkedin.datahub.graphql.generated.OwnerInput; -import com.linkedin.datahub.graphql.generated.OwnershipType; import com.linkedin.datahub.graphql.generated.ResourceRefInput; import com.linkedin.datahub.graphql.resolvers.mutate.util.OwnerUtils; import com.linkedin.metadata.entity.EntityService; @@ -20,7 +17,6 @@ import lombok.extern.slf4j.Slf4j; import static com.linkedin.datahub.graphql.resolvers.ResolverUtils.*; -import static com.linkedin.datahub.graphql.resolvers.mutate.util.OwnerUtils.*; @Slf4j @@ -32,30 +28,33 @@ public class AddOwnerResolver implements DataFetcher> @Override public CompletableFuture get(DataFetchingEnvironment environment) throws Exception { final AddOwnerInput input = bindArgument(environment.getArgument("input"), AddOwnerInput.class); - Urn ownerUrn = Urn.createFromString(input.getOwnerUrn()); - OwnerEntityType ownerEntityType = input.getOwnerEntityType(); - OwnershipType type = input.getType() == null ? OwnershipType.NONE : input.getType(); - String ownershipUrn = input.getOwnershipTypeUrn() == null ? mapOwnershipTypeToEntity(type.name()) : input.getOwnershipTypeUrn(); Urn targetUrn = Urn.createFromString(input.getResourceUrn()); + OwnerInput.Builder ownerInputBuilder = OwnerInput.builder(); + ownerInputBuilder.setOwnerUrn(input.getOwnerUrn()); + ownerInputBuilder.setOwnerEntityType(input.getOwnerEntityType()); + if (input.getType() != null) { + ownerInputBuilder.setType(input.getType()); + } + if (input.getOwnershipTypeUrn() != null) { + ownerInputBuilder.setOwnershipTypeUrn(input.getOwnershipTypeUrn()); + } + OwnerInput ownerInput = ownerInputBuilder.build(); if (!OwnerUtils.isAuthorizedToUpdateOwners(environment.getContext(), targetUrn)) { throw new AuthorizationException("Unauthorized to perform this action. Please contact your DataHub administrator."); } return CompletableFuture.supplyAsync(() -> { - OwnerUtils.validateAddInput( - ownerUrn, input.getOwnershipTypeUrn(), ownerEntityType, - targetUrn, - _entityService - ); + OwnerUtils.validateAddOwnerInput(ownerInput, ownerUrn, _entityService); + try { log.debug("Adding Owner. 
input: {}", input); Urn actor = CorpuserUrn.createFromString(((QueryContext) environment.getContext()).getActorUrn()); OwnerUtils.addOwnersToResources( - ImmutableList.of(new OwnerInput(input.getOwnerUrn(), ownerEntityType, type, ownershipUrn)), + ImmutableList.of(ownerInput), ImmutableList.of(new ResourceRefInput(input.getResourceUrn(), null, null)), actor, _entityService diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnersResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnersResolver.java index 06424efa83819f..4e5b5bdb2a651d 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnersResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/AddOwnersResolver.java @@ -39,7 +39,7 @@ public CompletableFuture get(DataFetchingEnvironment environment) throw throw new AuthorizationException("Unauthorized to perform this action. Please contact your DataHub administrator."); } - OwnerUtils.validateAddInput( + OwnerUtils.validateAddOwnerInput( owners, targetUrn, _entityService diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/BatchAddOwnersResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/BatchAddOwnersResolver.java index 019c044d81ab32..5beaeecae673f0 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/BatchAddOwnersResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/BatchAddOwnersResolver.java @@ -53,8 +53,7 @@ public CompletableFuture get(DataFetchingEnvironment environment) throw private void validateOwners(List owners) { for (OwnerInput ownerInput : owners) { - OwnerUtils.validateOwner(UrnUtils.getUrn(ownerInput.getOwnerUrn()), ownerInput.getOwnerEntityType(), - UrnUtils.getUrn(ownerInput.getOwnershipTypeUrn()), _entityService); + OwnerUtils.validateOwner(ownerInput, _entityService); } } diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java index d2f7f896e59532..72339958044231 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/mutate/util/OwnerUtils.java @@ -50,7 +50,7 @@ public static void addOwnersToResources( ) { final List changes = new ArrayList<>(); for (ResourceRefInput resource : resources) { - changes.add(buildAddOwnersProposal(owners, UrnUtils.getUrn(resource.getResourceUrn()), actor, entityService)); + changes.add(buildAddOwnersProposal(owners, UrnUtils.getUrn(resource.getResourceUrn()), entityService)); } EntityUtils.ingestChangeProposals(changes, entityService, actor, false); } @@ -69,7 +69,7 @@ public static void removeOwnersFromResources( } - private static MetadataChangeProposal buildAddOwnersProposal(List owners, Urn resourceUrn, Urn actor, EntityService entityService) { + static MetadataChangeProposal buildAddOwnersProposal(List owners, Urn resourceUrn, EntityService entityService) { Ownership ownershipAspect = (Ownership) EntityUtils.getAspectFromEntity( resourceUrn.toString(), Constants.OWNERSHIP_ASPECT_NAME, entityService, @@ -181,18 +181,13 @@ public static boolean isAuthorizedToUpdateOwners(@Nonnull 
QueryContext context, orPrivilegeGroups); } - public static Boolean validateAddInput( + public static Boolean validateAddOwnerInput( List owners, Urn resourceUrn, EntityService entityService ) { for (OwnerInput owner : owners) { - boolean result = validateAddInput( - UrnUtils.getUrn(owner.getOwnerUrn()), - owner.getOwnershipTypeUrn(), - owner.getOwnerEntityType(), - resourceUrn, - entityService); + boolean result = validateAddOwnerInput(owner, resourceUrn, entityService); if (!result) { return false; } @@ -200,44 +195,29 @@ public static Boolean validateAddInput( return true; } - public static Boolean validateAddInput( - Urn ownerUrn, - String ownershipEntityUrn, - OwnerEntityType ownerEntityType, + public static Boolean validateAddOwnerInput( + OwnerInput owner, Urn resourceUrn, EntityService entityService ) { - if (OwnerEntityType.CORP_GROUP.equals(ownerEntityType) && !Constants.CORP_GROUP_ENTITY_NAME.equals(ownerUrn.getEntityType())) { - throw new IllegalArgumentException(String.format("Failed to change ownership for resource %s. Expected a corp group urn.", resourceUrn)); - } - - if (OwnerEntityType.CORP_USER.equals(ownerEntityType) && !Constants.CORP_USER_ENTITY_NAME.equals(ownerUrn.getEntityType())) { - throw new IllegalArgumentException(String.format("Failed to change ownership for resource %s. Expected a corp user urn.", resourceUrn)); - } - if (!entityService.exists(resourceUrn)) { throw new IllegalArgumentException(String.format("Failed to change ownership for resource %s. Resource does not exist.", resourceUrn)); } - if (!entityService.exists(ownerUrn)) { - throw new IllegalArgumentException(String.format("Failed to change ownership for resource %s. Owner %s does not exist.", resourceUrn, ownerUrn)); - } - - if (ownershipEntityUrn != null && !entityService.exists(UrnUtils.getUrn(ownershipEntityUrn))) { - throw new IllegalArgumentException(String.format("Failed to change ownership type for resource %s. Ownership Type " - + "%s does not exist.", resourceUrn, ownershipEntityUrn)); - } + validateOwner(owner, entityService); return true; } public static void validateOwner( - Urn ownerUrn, - OwnerEntityType ownerEntityType, - Urn ownershipEntityUrn, + OwnerInput owner, EntityService entityService ) { + + OwnerEntityType ownerEntityType = owner.getOwnerEntityType(); + Urn ownerUrn = UrnUtils.getUrn(owner.getOwnerUrn()); + if (OwnerEntityType.CORP_GROUP.equals(ownerEntityType) && !Constants.CORP_GROUP_ENTITY_NAME.equals(ownerUrn.getEntityType())) { throw new IllegalArgumentException( String.format("Failed to change ownership for resource(s). Expected a corp group urn, found %s", ownerUrn)); @@ -252,9 +232,14 @@ public static void validateOwner( throw new IllegalArgumentException(String.format("Failed to change ownership for resource(s). Owner with urn %s does not exist.", ownerUrn)); } - if (!entityService.exists(ownershipEntityUrn)) { - throw new IllegalArgumentException(String.format("Failed to change ownership for resource(s). Ownership type with " - + "urn %s does not exist.", ownershipEntityUrn)); + if (owner.getOwnershipTypeUrn() != null && !entityService.exists(UrnUtils.getUrn(owner.getOwnershipTypeUrn()))) { + throw new IllegalArgumentException(String.format("Failed to change ownership for resource(s). Custom Ownership type with " + + "urn %s does not exist.", owner.getOwnershipTypeUrn())); + } + + if (owner.getType() == null && owner.getOwnershipTypeUrn() == null) { + throw new IllegalArgumentException("Failed to change ownership for resource(s). 
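Editor's note: the net effect of the OwnerUtils refactor above is that validation keys off a single OwnerInput, and an owner must carry either a built-in type or a custom ownershipTypeUrn. A hedged sketch of how a caller now assembles one, mirroring AddOwnerResolver; the urn and enum values are examples:

```java
import com.linkedin.datahub.graphql.generated.OwnerEntityType;
import com.linkedin.datahub.graphql.generated.OwnerInput;
import com.linkedin.datahub.graphql.generated.OwnershipType;

public class OwnerInputSketch {
    public static OwnerInput buildExample() {
        // Build the input the way AddOwnerResolver now does; either setType or
        // setOwnershipTypeUrn must be called, or validateOwner will reject it.
        OwnerInput.Builder builder = OwnerInput.builder();
        builder.setOwnerUrn("urn:li:corpuser:jdoe"); // example owner urn
        builder.setOwnerEntityType(OwnerEntityType.CORP_USER);
        builder.setType(OwnershipType.TECHNICAL_OWNER); // assumed enum value
        // Validation then runs as:
        // OwnerUtils.validateAddOwnerInput(ownerInput, targetUrn, entityService);
        return builder.build();
    }
}
```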
Expected either " + + "type or ownershipTypeUrn to be specified."); } } @@ -269,11 +254,11 @@ public static Boolean validateRemoveInput( } public static void addCreatorAsOwner( - QueryContext context, - String urn, - OwnerEntityType ownerEntityType, - OwnershipType ownershipType, - EntityService entityService) { + QueryContext context, + String urn, + OwnerEntityType ownerEntityType, + OwnershipType ownershipType, + EntityService entityService) { try { Urn actorUrn = CorpuserUrn.createFromString(context.getActorUrn()); String ownershipTypeUrn = mapOwnershipTypeToEntity(ownershipType.name()); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java index 2f20fdaf1e9b1b..11f7793db82c8b 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/resolvers/policy/GetGrantedPrivilegesResolver.java @@ -2,7 +2,7 @@ import com.datahub.authorization.AuthorizerChain; import com.datahub.authorization.DataHubAuthorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.linkedin.datahub.graphql.QueryContext; import com.linkedin.datahub.graphql.exception.AuthorizationException; import com.linkedin.datahub.graphql.generated.GetGrantedPrivilegesInput; @@ -33,8 +33,8 @@ public CompletableFuture get(final DataFetchingEnvironment environme if (!isAuthorized(context, actor)) { throw new AuthorizationException("Unauthorized to get privileges for the given author."); } - final Optional resourceSpec = Optional.ofNullable(input.getResourceSpec()) - .map(spec -> new ResourceSpec(EntityTypeMapper.getName(spec.getResourceType()), spec.getResourceUrn())); + final Optional resourceSpec = Optional.ofNullable(input.getResourceSpec()) + .map(spec -> new EntitySpec(EntityTypeMapper.getName(spec.getResourceType()), spec.getResourceUrn())); if (context.getAuthorizer() instanceof AuthorizerChain) { DataHubAuthorizer dataHubAuthorizer = ((AuthorizerChain) context.getAuthorizer()).getDefaultAuthorizer(); diff --git a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatforminstance/DataPlatformInstanceType.java b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatforminstance/DataPlatformInstanceType.java index 2423fc31ea52e3..87614e13325283 100644 --- a/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatforminstance/DataPlatformInstanceType.java +++ b/datahub-graphql-core/src/main/java/com/linkedin/datahub/graphql/types/dataplatforminstance/DataPlatformInstanceType.java @@ -4,16 +4,25 @@ import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; +import com.linkedin.datahub.graphql.generated.AutoCompleteResults; import com.linkedin.datahub.graphql.generated.DataPlatformInstance; import com.linkedin.datahub.graphql.generated.Entity; import com.linkedin.datahub.graphql.generated.EntityType; +import com.linkedin.datahub.graphql.generated.FacetFilterInput; +import com.linkedin.datahub.graphql.generated.SearchResults; import com.linkedin.datahub.graphql.types.dataplatforminstance.mappers.DataPlatformInstanceMapper; +import com.linkedin.datahub.graphql.types.mappers.AutoCompleteResultsMapper; +import 
com.linkedin.datahub.graphql.types.SearchableEntityType; import com.linkedin.entity.EntityResponse; import com.linkedin.entity.client.EntityClient; import com.linkedin.metadata.Constants; +import com.linkedin.metadata.query.AutoCompleteResult; +import com.linkedin.metadata.query.filter.Filter; import graphql.execution.DataFetcherResult; +import org.apache.commons.lang3.NotImplementedException; import javax.annotation.Nonnull; +import javax.annotation.Nullable; import java.util.ArrayList; import java.util.HashSet; import java.util.List; @@ -22,7 +31,10 @@ import java.util.function.Function; import java.util.stream.Collectors; -public class DataPlatformInstanceType implements com.linkedin.datahub.graphql.types.EntityType { +import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ENTITY_NAME; + +public class DataPlatformInstanceType implements SearchableEntityType, + com.linkedin.datahub.graphql.types.EntityType { static final Set ASPECTS_TO_FETCH = ImmutableSet.of( Constants.DATA_PLATFORM_INSTANCE_KEY_ASPECT_NAME, @@ -84,4 +96,24 @@ public List> batchLoad(@Nonnull List filters, + int start, + int count, + @Nonnull final QueryContext context) throws Exception { + throw new NotImplementedException("Searchable type (deprecated) not implemented on DataPlatformInstance entity type"); + } + + @Override + public AutoCompleteResults autoComplete(@Nonnull String query, + @Nullable String field, + @Nullable Filter filters, + int limit, + @Nonnull final QueryContext context) throws Exception { + final AutoCompleteResult result = _entityClient.autoComplete(DATA_PLATFORM_INSTANCE_ENTITY_NAME, query, + filters, limit, context.getAuthentication()); + return AutoCompleteResultsMapper.map(result); + } + } diff --git a/datahub-graphql-core/src/main/resources/entity.graphql b/datahub-graphql-core/src/main/resources/entity.graphql index 39f86948c77c40..b37a8f34fa0563 100644 --- a/datahub-graphql-core/src/main/resources/entity.graphql +++ b/datahub-graphql-core/src/main/resources/entity.graphql @@ -226,6 +226,11 @@ type Query { listOwnershipTypes( "Input required for listing custom ownership types" input: ListOwnershipTypesInput!): ListOwnershipTypesResult! + + """ + Fetch a Data Platform Instance by primary key (urn) + """ + dataPlatformInstance(urn: String!): DataPlatformInstance } """ @@ -3783,6 +3788,11 @@ type CorpGroup implements Entity { Additional read only info about the group """ info: CorpGroupInfo @deprecated + + """ + Whether or not this entity exists on DataHub + """ + exists: Boolean } """ diff --git a/datahub-graphql-core/src/main/resources/ingestion.graphql b/datahub-graphql-core/src/main/resources/ingestion.graphql index 69c8aff124583c..21f9fb2633119b 100644 --- a/datahub-graphql-core/src/main/resources/ingestion.graphql +++ b/datahub-graphql-core/src/main/resources/ingestion.graphql @@ -332,6 +332,11 @@ type IngestionConfig { Advanced: Whether or not to run ingestion in debug mode """ debugMode: Boolean + + """ + Advanced: Extra arguments for the ingestion run. + """ + extraArgs: [StringMapEntry!] } """ @@ -483,6 +488,11 @@ input UpdateIngestionSourceConfigInput { Whether or not to run ingestion in debug mode """ debugMode: Boolean + + """ + Extra arguments for the ingestion run. + """ + extraArgs: [StringMapEntryInput!] 
} """ diff --git a/datahub-graphql-core/src/main/resources/search.graphql b/datahub-graphql-core/src/main/resources/search.graphql index 4cabdb04afe77c..e0cde5a2db9f99 100644 --- a/datahub-graphql-core/src/main/resources/search.graphql +++ b/datahub-graphql-core/src/main/resources/search.graphql @@ -458,6 +458,26 @@ enum FilterOperator { Represents the relation: The field exists. If the field is an array, the field is either not present or empty. """ EXISTS + + """ + Represent the relation greater than, e.g. ownerCount > 5 + """ + GREATER_THAN + + """ + Represent the relation greater than or equal to, e.g. ownerCount >= 5 + """ + GREATER_THAN_OR_EQUAL_TO + + """ + Represent the relation less than, e.g. ownerCount < 3 + """ + LESS_THAN + + """ + Represent the relation less than or equal to, e.g. ownerCount <= 3 + """ + LESS_THAN_OR_EQUAL_TO } """ diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/TestUtils.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/TestUtils.java index 272a93fa1989c9..606123cac926de 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/TestUtils.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/TestUtils.java @@ -8,6 +8,7 @@ import com.datahub.plugins.auth.authorization.Authorizer; import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.UrnUtils; +import com.linkedin.data.schema.annotation.PathSpecBasedSchemaAnnotationVisitor; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.ebean.transactions.AspectsBatchImpl; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; @@ -21,6 +22,8 @@ public class TestUtils { public static EntityService getMockEntityService() { + PathSpecBasedSchemaAnnotationVisitor.class.getClassLoader() + .setClassAssertionStatus(PathSpecBasedSchemaAnnotationVisitor.class.getName(), false); EntityRegistry registry = new ConfigEntityRegistry(TestUtils.class.getResourceAsStream("/test-entity-registry.yaml")); EntityService mockEntityService = Mockito.mock(EntityService.class); Mockito.when(mockEntityService.getEntityRegistry()).thenReturn(registry); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/GlossaryUtilsTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/GlossaryUtilsTest.java index ccaab44f60dd40..8bfc32e1999ae2 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/GlossaryUtilsTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/glossary/GlossaryUtilsTest.java @@ -5,7 +5,7 @@ import com.datahub.authorization.AuthorizationRequest; import com.datahub.authorization.AuthorizationResult; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.linkedin.common.urn.GlossaryNodeUrn; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; @@ -89,17 +89,17 @@ private void setUpTests() throws Exception { Mockito.any(Authentication.class) )).thenReturn(new EntityResponse().setAspects(new EnvelopedAspectMap(parentNode3Aspects))); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", 
AuthorizationResult.Type.DENY, resourceSpec3); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec2); - final ResourceSpec resourceSpec1 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); + final EntitySpec resourceSpec1 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec1); } - private void mockAuthRequest(String privilege, AuthorizationResult.Type allowOrDeny, ResourceSpec resourceSpec) { + private void mockAuthRequest(String privilege, AuthorizationResult.Type allowOrDeny, EntitySpec resourceSpec) { final AuthorizationRequest authorizationRequest = new AuthorizationRequest( userUrn, privilege, @@ -150,7 +150,7 @@ public void testCanManageChildrenEntitiesAuthorized() throws Exception { // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn.toString()); + final EntitySpec resourceSpec = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", AuthorizationResult.Type.ALLOW, resourceSpec); assertTrue(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn, mockClient)); @@ -162,7 +162,7 @@ public void testCanManageChildrenEntitiesUnauthorized() throws Exception { // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn.toString()); + final EntitySpec resourceSpec = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn.toString()); mockAuthRequest("MANAGE_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec); @@ -175,13 +175,13 @@ public void testCanManageChildrenRecursivelyEntitiesAuthorized() throws Exceptio // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.ALLOW, resourceSpec3); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec2); - final ResourceSpec resourceSpec1 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); + final EntitySpec resourceSpec1 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec1); assertTrue(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn1, mockClient)); @@ 
-193,13 +193,13 @@ public void testCanManageChildrenRecursivelyEntitiesUnauthorized() throws Except // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec3); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec2); - final ResourceSpec resourceSpec1 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); + final EntitySpec resourceSpec1 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec1); assertFalse(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn1, mockClient)); @@ -211,10 +211,10 @@ public void testCanManageChildrenRecursivelyEntitiesAuthorizedLevel2() throws Ex // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.ALLOW, resourceSpec2); - final ResourceSpec resourceSpec1 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); + final EntitySpec resourceSpec1 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn1.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec1); assertTrue(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn1, mockClient)); @@ -226,10 +226,10 @@ public void testCanManageChildrenRecursivelyEntitiesUnauthorizedLevel2() throws // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec3); - final ResourceSpec resourceSpec2 = new ResourceSpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); + final EntitySpec resourceSpec2 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn2.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec2); assertFalse(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn2, mockClient)); @@ -241,7 +241,7 @@ public void testCanManageChildrenRecursivelyEntitiesNoLevel2() throws Exception // they do NOT have the MANAGE_GLOSSARIES platform privilege mockAuthRequest("MANAGE_GLOSSARIES", AuthorizationResult.Type.DENY, null); - final ResourceSpec resourceSpec3 = new ResourceSpec(parentNodeUrn.getEntityType(), 
parentNodeUrn3.toString()); + final EntitySpec resourceSpec3 = new EntitySpec(parentNodeUrn.getEntityType(), parentNodeUrn3.toString()); mockAuthRequest("MANAGE_ALL_GLOSSARY_CHILDREN", AuthorizationResult.Type.DENY, resourceSpec3); assertFalse(GlossaryUtils.canManageChildrenEntities(mockContext, parentNodeUrn3, mockClient)); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolverTest.java index 2538accc694fba..16d8da9169a8fc 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/ingest/source/UpsertIngestionSourceResolverTest.java @@ -26,7 +26,7 @@ public class UpsertIngestionSourceResolverTest { "Test source", "mysql", "Test source description", new UpdateIngestionSourceScheduleInput("* * * * *", "UTC"), - new UpdateIngestionSourceConfigInput("my test recipe", "0.8.18", "executor id", false) + new UpdateIngestionSourceConfigInput("my test recipe", "0.8.18", "executor id", false, null) ); @Test diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/owner/AddOwnersResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/owner/AddOwnersResolverTest.java index efc0c5dfcf36d8..329d71ec125db0 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/owner/AddOwnersResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/owner/AddOwnersResolverTest.java @@ -2,6 +2,11 @@ import com.google.common.collect.ImmutableList; import com.linkedin.common.AuditStamp; +import com.linkedin.common.Owner; +import com.linkedin.common.OwnerArray; +import com.linkedin.common.Ownership; +import com.linkedin.common.OwnershipSource; +import com.linkedin.common.OwnershipSourceType; import com.linkedin.common.urn.Urn; import com.linkedin.common.urn.UrnUtils; import com.linkedin.datahub.graphql.QueryContext; @@ -28,6 +33,7 @@ public class AddOwnersResolverTest { private static final String TEST_ENTITY_URN = "urn:li:dataset:(urn:li:dataPlatform:mysql,my-test,PROD)"; private static final String TEST_OWNER_1_URN = "urn:li:corpuser:test-id-1"; private static final String TEST_OWNER_2_URN = "urn:li:corpuser:test-id-2"; + private static final String TEST_OWNER_3_URN = "urn:li:corpGroup:test-id-3"; @Test public void testGetSuccessNoExistingOwners() throws Exception { @@ -75,33 +81,41 @@ public void testGetSuccessNoExistingOwners() throws Exception { } @Test - public void testGetSuccessExistingOwners() throws Exception { + public void testGetSuccessExistingOwnerNewType() throws Exception { EntityService mockService = getMockEntityService(); + com.linkedin.common.Ownership oldOwnership = new Ownership().setOwners(new OwnerArray( + ImmutableList.of(new Owner() + .setOwner(UrnUtils.getUrn(TEST_OWNER_1_URN)) + .setType(com.linkedin.common.OwnershipType.NONE) + .setSource(new OwnershipSource().setType(OwnershipSourceType.MANUAL)) + ))); + Mockito.when(mockService.getAspect( - Mockito.eq(UrnUtils.getUrn(TEST_ENTITY_URN)), - Mockito.eq(Constants.OWNERSHIP_ASPECT_NAME), - Mockito.eq(0L))) - .thenReturn(null); + Mockito.eq(UrnUtils.getUrn(TEST_ENTITY_URN)), + Mockito.eq(Constants.OWNERSHIP_ASPECT_NAME), + Mockito.eq(0L))) + 
.thenReturn(oldOwnership); Mockito.when(mockService.exists(Urn.createFromString(TEST_ENTITY_URN))).thenReturn(true); Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_1_URN))).thenReturn(true); - Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_2_URN))).thenReturn(true); Mockito.when(mockService.exists(Urn.createFromString( - OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.TECHNICAL_OWNER.name())))) - .thenReturn(true); + OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.TECHNICAL_OWNER.name())))) + .thenReturn(true); AddOwnersResolver resolver = new AddOwnersResolver(mockService); // Execute resolver QueryContext mockContext = getMockAllowContext(); DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + AddOwnersInput input = new AddOwnersInput(ImmutableList.of( - new OwnerInput(TEST_OWNER_1_URN, OwnerEntityType.CORP_USER, OwnershipType.TECHNICAL_OWNER, - OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())), - new OwnerInput(TEST_OWNER_2_URN, OwnerEntityType.CORP_USER, OwnershipType.TECHNICAL_OWNER, - OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_1_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_USER) + .build() ), TEST_ENTITY_URN); Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); Mockito.when(mockEnv.getContext()).thenReturn(mockContext); @@ -111,11 +125,126 @@ public void testGetSuccessExistingOwners() throws Exception { verifyIngestProposal(mockService, 1); Mockito.verify(mockService, Mockito.times(1)).exists( - Mockito.eq(Urn.createFromString(TEST_OWNER_1_URN)) + Mockito.eq(Urn.createFromString(TEST_OWNER_1_URN)) ); + } + + @Test + public void testGetSuccessDeprecatedTypeToOwnershipType() throws Exception { + EntityService mockService = getMockEntityService(); + + com.linkedin.common.Ownership oldOwnership = new Ownership().setOwners(new OwnerArray( + ImmutableList.of(new Owner() + .setOwner(UrnUtils.getUrn(TEST_OWNER_1_URN)) + .setType(com.linkedin.common.OwnershipType.TECHNICAL_OWNER) + .setSource(new OwnershipSource().setType(OwnershipSourceType.MANUAL)) + ))); + + Mockito.when(mockService.getAspect( + Mockito.eq(UrnUtils.getUrn(TEST_ENTITY_URN)), + Mockito.eq(Constants.OWNERSHIP_ASPECT_NAME), + Mockito.eq(0L))) + .thenReturn(oldOwnership); + + Mockito.when(mockService.exists(Urn.createFromString(TEST_ENTITY_URN))).thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_1_URN))).thenReturn(true); + + Mockito.when(mockService.exists(Urn.createFromString( + OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.TECHNICAL_OWNER.name())))) + .thenReturn(true); + + AddOwnersResolver resolver = new AddOwnersResolver(mockService); + + // Execute resolver + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + + AddOwnersInput input = new AddOwnersInput(ImmutableList.of(OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_1_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_USER) + .build() + ), TEST_ENTITY_URN); + Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); + 
Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + assertTrue(resolver.get(mockEnv).get()); + + // Unable to easily validate exact payload due to the injected timestamp + verifyIngestProposal(mockService, 1); Mockito.verify(mockService, Mockito.times(1)).exists( - Mockito.eq(Urn.createFromString(TEST_OWNER_2_URN)) + Mockito.eq(Urn.createFromString(TEST_OWNER_1_URN)) + ); + } + + @Test + public void testGetSuccessMultipleOwnerTypes() throws Exception { + EntityService mockService = getMockEntityService(); + + com.linkedin.common.Ownership oldOwnership = new Ownership().setOwners(new OwnerArray( + ImmutableList.of(new Owner() + .setOwner(UrnUtils.getUrn(TEST_OWNER_1_URN)) + .setType(com.linkedin.common.OwnershipType.NONE) + .setSource(new OwnershipSource().setType(OwnershipSourceType.MANUAL)) + ))); + + Mockito.when(mockService.getAspect( + Mockito.eq(UrnUtils.getUrn(TEST_ENTITY_URN)), + Mockito.eq(Constants.OWNERSHIP_ASPECT_NAME), + Mockito.eq(0L))) + .thenReturn(oldOwnership); + + Mockito.when(mockService.exists(Urn.createFromString(TEST_ENTITY_URN))).thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_1_URN))).thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_2_URN))).thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString(TEST_OWNER_3_URN))).thenReturn(true); + + Mockito.when(mockService.exists(Urn.createFromString( + OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.TECHNICAL_OWNER.name())))) + .thenReturn(true); + Mockito.when(mockService.exists(Urn.createFromString( + OwnerUtils.mapOwnershipTypeToEntity(com.linkedin.datahub.graphql.generated.OwnershipType.BUSINESS_OWNER.name())))) + .thenReturn(true); + + AddOwnersResolver resolver = new AddOwnersResolver(mockService); + + // Execute resolver + QueryContext mockContext = getMockAllowContext(); + DataFetchingEnvironment mockEnv = Mockito.mock(DataFetchingEnvironment.class); + + AddOwnersInput input = new AddOwnersInput(ImmutableList.of(OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_1_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_USER) + .build(), + OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_2_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.BUSINESS_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_USER) + .build(), + OwnerInput.builder() + .setOwnerUrn(TEST_OWNER_3_URN) + .setOwnershipTypeUrn(OwnerUtils.mapOwnershipTypeToEntity(OwnershipType.TECHNICAL_OWNER.name())) + .setOwnerEntityType(OwnerEntityType.CORP_GROUP) + .build() + ), TEST_ENTITY_URN); + Mockito.when(mockEnv.getArgument(Mockito.eq("input"))).thenReturn(input); + Mockito.when(mockEnv.getContext()).thenReturn(mockContext); + assertTrue(resolver.get(mockEnv).get()); + + // Unable to easily validate exact payload due to the injected timestamp + verifyIngestProposal(mockService, 1); + + Mockito.verify(mockService, Mockito.times(1)).exists( + Mockito.eq(Urn.createFromString(TEST_OWNER_1_URN)) + ); + + Mockito.verify(mockService, Mockito.times(1)).exists( + Mockito.eq(Urn.createFromString(TEST_OWNER_2_URN)) + ); + + Mockito.verify(mockService, Mockito.times(1)).exists( + Mockito.eq(Urn.createFromString(TEST_OWNER_3_URN)) ); } diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/CreateQueryResolverTest.java 
b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/CreateQueryResolverTest.java index 196eb24b52bf81..9c04c67dd3a3b3 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/CreateQueryResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/CreateQueryResolverTest.java @@ -5,7 +5,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authorization.AuthorizationRequest; import com.datahub.authorization.AuthorizationResult; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -201,7 +201,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_QUERIES_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); @@ -210,7 +210,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/DeleteQueryResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/DeleteQueryResolverTest.java index a6b4887b0e8820..78c894f27cbc3b 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/DeleteQueryResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/DeleteQueryResolverTest.java @@ -5,7 +5,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authorization.AuthorizationRequest; import com.datahub.authorization.AuthorizationResult; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; @@ -134,7 +134,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { DeleteQueryResolverTest.TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_QUERIES_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( DeleteQueryResolverTest.TEST_DATASET_URN.getEntityType(), DeleteQueryResolverTest.TEST_DATASET_URN.toString())) ); @@ -143,7 +143,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); diff --git a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/UpdateQueryResolverTest.java b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/UpdateQueryResolverTest.java index 7a76b6d6be5a43..9b500b5fb39361 100644 --- a/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/UpdateQueryResolverTest.java +++ b/datahub-graphql-core/src/test/java/com/linkedin/datahub/graphql/resolvers/query/UpdateQueryResolverTest.java @@ -5,7 +5,7 @@ import com.datahub.authentication.Authentication; import 
com.datahub.authorization.AuthorizationRequest; import com.datahub.authorization.AuthorizationResult; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableMap; @@ -206,7 +206,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_QUERIES_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); @@ -215,7 +215,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN.getEntityType(), TEST_DATASET_URN.toString())) ); @@ -224,7 +224,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_QUERIES_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN_2.getEntityType(), TEST_DATASET_URN_2.toString())) ); @@ -233,7 +233,7 @@ private QueryContext getMockQueryContext(boolean allowEditEntityQueries) { TEST_ACTOR_URN.toString(), PoliciesConfig.EDIT_ENTITY_PRIVILEGE.getType(), Optional.of( - new ResourceSpec( + new EntitySpec( TEST_DATASET_URN_2.getEntityType(), TEST_DATASET_URN_2.toString())) ); diff --git a/datahub-graphql-core/src/test/resources/test-entity-registry.yaml b/datahub-graphql-core/src/test/resources/test-entity-registry.yaml index d694ae53ac42f7..efd75a7fb07f51 100644 --- a/datahub-graphql-core/src/test/resources/test-entity-registry.yaml +++ b/datahub-graphql-core/src/test/resources/test-entity-registry.yaml @@ -181,6 +181,7 @@ entities: - assertionInfo - dataPlatformInstance - assertionRunEvent + - assertionActions - status - name: dataHubRetention category: internal @@ -292,4 +293,11 @@ entities: aspects: - ownershipTypeInfo - status +- name: dataContract + category: core + keyAspect: dataContractKey + aspects: + - dataContractProperties + - dataContractStatus + - status events: diff --git a/datahub-web-react/src/app/domain/CreateDomainModal.tsx b/datahub-web-react/src/app/domain/CreateDomainModal.tsx index ca1bc305960035..606444d34bdc97 100644 --- a/datahub-web-react/src/app/domain/CreateDomainModal.tsx +++ b/datahub-web-react/src/app/domain/CreateDomainModal.tsx @@ -191,7 +191,10 @@ export default function CreateDomainModal({ onClose, onCreate }: Props) { rules={[{ whitespace: true }, { min: 1, max: 500 }]} hasFeedback > - + diff --git a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx index 0e5c035df00c10..b69f0c5458b5de 100644 --- a/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx +++ b/datahub-web-react/src/app/domain/nestedDomains/ManageDomainsPageV2.tsx @@ -42,7 +42,12 @@ export default function ManageDomainsPageV2() {
-
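Note on the search.graphql hunk earlier in this diff: the four new comparison operators surface in the frontend through the generated types. A minimal sketch of a filter that would use one of them — the FacetFilterInput field names and the PascalCase enum member are assumptions based on the usual types.generated output, and ownerCount is taken from the schema docstring examples:

import { FacetFilterInput, FilterOperator } from '../../types.generated';

// Match entities with more than five owners using the new GREATER_THAN operator.
const moreThanFiveOwners: FacetFilterInput = {
    field: 'ownerCount',
    condition: FilterOperator.GreaterThan,
    values: ['5'],
};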
diff --git a/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx index 09c8e13853bb70..bf70bd043fd4a1 100644 --- a/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx +++ b/datahub-web-react/src/app/domain/nestedDomains/domainNavigator/DomainNode.tsx @@ -103,7 +103,7 @@ export default function DomainNode({ domain, numDomainChildren, domainUrnToHide, return ( <> - + {hasDomainChildren && ( diff --git a/datahub-web-react/src/app/entity/group/GroupProfile.tsx b/datahub-web-react/src/app/entity/group/GroupProfile.tsx index d5e284af931df3..11ed31e00003f4 100644 --- a/datahub-web-react/src/app/entity/group/GroupProfile.tsx +++ b/datahub-web-react/src/app/entity/group/GroupProfile.tsx @@ -11,11 +11,12 @@ import { RoutedTabs } from '../../shared/RoutedTabs'; import GroupInfoSidebar from './GroupInfoSideBar'; import { GroupAssets } from './GroupAssets'; import { ErrorSection } from '../../shared/error/ErrorSection'; +import NonExistentEntityPage from '../shared/entity/NonExistentEntityPage'; const messageStyle = { marginTop: '10%' }; export enum TabType { - Assets = 'Assets', + Assets = 'Owner Of', Members = 'Members', } @@ -110,6 +111,9 @@ export default function GroupProfile() { urn, }; + if (data?.corpGroup?.exists === false) { + return ; + } return ( <> {error && } diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx index d48ead2f5863e5..9788d36af2c65a 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/CreateGlossaryEntityModal.tsx @@ -112,7 +112,11 @@ function CreateGlossaryEntityModal(props: Props) { - @@ -130,6 +134,7 @@ function CreateGlossaryEntityModal(props: Props) { > Name}> setIsMoveModalVisible(true)} @@ -223,7 +224,7 @@ function EntityDropdown(props: Props) { : undefined } > - +  Delete diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx index cdbf6fdabf3c99..3826f934c1c25e 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveDomainModal.tsx @@ -67,6 +67,7 @@ function MoveDomainModal(props: Props) { return ( Cancel - + } > diff --git a/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveGlossaryEntityModal.tsx b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveGlossaryEntityModal.tsx index 5352825708776a..37a625f58100b3 100644 --- a/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveGlossaryEntityModal.tsx +++ b/datahub-web-react/src/app/entity/shared/EntityDropdown/MoveGlossaryEntityModal.tsx @@ -64,6 +64,7 @@ function MoveGlossaryEntityModal(props: Props) { return ( Cancel - + } > diff --git a/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx b/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx index 579b8c9905da07..cb37c44a36caa7 100644 --- a/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx +++ b/datahub-web-react/src/app/entity/shared/components/legacy/DescriptionModal.tsx @@ -41,7 +41,11 @@ export default function UpdateDescriptionModal({ title, description, original, o footer={ <> - diff --git 
a/datahub-web-react/src/app/entity/shared/components/styled/AddLinkModal.tsx b/datahub-web-react/src/app/entity/shared/components/styled/AddLinkModal.tsx index 34d4f0cb3fe913..68a8cf40943629 100644 --- a/datahub-web-react/src/app/entity/shared/components/styled/AddLinkModal.tsx +++ b/datahub-web-react/src/app/entity/shared/components/styled/AddLinkModal.tsx @@ -57,7 +57,7 @@ export const AddLinkModal = ({ buttonProps, refetch }: AddLinkProps) => { return ( <> - { , - , ]} >
{ )} diff --git a/datahub-web-react/src/app/entity/shared/tabs/Documentation/DocumentationTab.tsx b/datahub-web-react/src/app/entity/shared/tabs/Documentation/DocumentationTab.tsx index de065d23e56e7c..344c2aef871750 100644 --- a/datahub-web-react/src/app/entity/shared/tabs/Documentation/DocumentationTab.tsx +++ b/datahub-web-react/src/app/entity/shared/tabs/Documentation/DocumentationTab.tsx @@ -60,6 +60,7 @@ export const DocumentationTab = ({ properties }: { properties?: Props }) => {
- diff --git a/datahub-web-react/src/app/entity/user/UserProfile.tsx b/datahub-web-react/src/app/entity/user/UserProfile.tsx index 1d20072c4ea8f5..e8284ba61afe47 100644 --- a/datahub-web-react/src/app/entity/user/UserProfile.tsx +++ b/datahub-web-react/src/app/entity/user/UserProfile.tsx @@ -17,7 +17,7 @@ export interface Props { } export enum TabType { - Assets = 'Assets', + Assets = 'Owner Of', Groups = 'Groups', } const ENABLED_TAB_TYPES = [TabType.Assets, TabType.Groups]; diff --git a/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx b/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx index 11f54cb5078e6e..a5262265fd23d7 100644 --- a/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx +++ b/datahub-web-react/src/app/glossary/BusinessGlossaryPage.tsx @@ -92,11 +92,12 @@ function BusinessGlossaryPage() { {(termsError || nodesError) && ( )} - + Business Glossary
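The GroupProfile.tsx hunk above guards on the new CorpGroup.exists field (added to entity.graphql earlier in this diff) before rendering the profile; the returned component is the NonExistentEntityPage imported in the same hunk. A sketch of the guard as it reads in context:

// Inside GroupProfile(), before the main render path:
if (data?.corpGroup?.exists === false) {
    return <NonExistentEntityPage />;
}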
diff --git a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx index 6c91a0f6f3f8f2..13af19b0b6ac29 100644 --- a/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx +++ b/datahub-web-react/src/app/ingest/source/IngestionSourceList.tsx @@ -15,7 +15,7 @@ import { Message } from '../../shared/Message'; import TabToolbar from '../../entity/shared/components/styled/TabToolbar'; import { IngestionSourceBuilderModal } from './builder/IngestionSourceBuilderModal'; import { addToListIngestionSourcesCache, CLI_EXECUTOR_ID, removeFromListIngestionSourcesCache } from './utils'; -import { DEFAULT_EXECUTOR_ID, SourceBuilderState } from './builder/types'; +import { DEFAULT_EXECUTOR_ID, SourceBuilderState, StringMapEntryInput } from './builder/types'; import { IngestionSource, UpdateIngestionSourceInput } from '../../../types.generated'; import { SearchBar } from '../../search/SearchBar'; import { useEntityRegistry } from '../../useEntityRegistry'; @@ -173,6 +173,11 @@ export const IngestionSourceList = () => { setFocusSourceUrn(undefined); }; + const formatExtraArgs = (extraArgs): StringMapEntryInput[] => { + if (extraArgs === null || extraArgs === undefined) return []; + return extraArgs.map((entry) => ({ key: entry.key, value: entry.value })); + }; + const createOrUpdateIngestionSource = ( input: UpdateIngestionSourceInput, resetState: () => void, @@ -294,6 +299,7 @@ export const IngestionSourceList = () => { (recipeBuilderState.config?.executorId as string)) || DEFAULT_EXECUTOR_ID, debugMode: recipeBuilderState.config?.debugMode || false, + extraArgs: formatExtraArgs(recipeBuilderState.config?.extraArgs || []), }, schedule: recipeBuilderState.schedule && { interval: recipeBuilderState.schedule?.interval as string, @@ -358,7 +364,12 @@ export const IngestionSourceList = () => {
- )} diff --git a/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx b/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx index dba9b25e14e99a..7a14b6a7941896 100644 --- a/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/CreateScheduleStep.tsx @@ -167,7 +167,11 @@ export const CreateScheduleStep = ({ state, updateState, goTo, prev }: StepProps
-
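The IngestionSourceList.tsx hunk above normalizes the builder state's extraArgs into key/value pairs via formatExtraArgs before issuing the update mutation, and the NameSourceStep.tsx hunk below writes three reserved keys into that list. A sketch of the resulting extraArgs payload on UpdateIngestionSourceConfigInput — the values are hypothetical, for illustration only:

import { StringMapEntryInput } from './types';

// Keys written by NameSourceStep below; values here are made-up examples.
const extraArgs: StringMapEntryInput[] = [
    { key: 'extra_env_vars', value: 'MY_ENV_VAR=value' },
    { key: 'extra_pip_plugins', value: 'datahub-kafka' },
    { key: 'extra_pip_requirements', value: 'sqlparse==0.4.3' },
];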
diff --git a/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx b/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx index 913f8253ece5af..3092364bb8bdd1 100644 --- a/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/NameSourceStep.tsx @@ -1,7 +1,7 @@ import { Button, Checkbox, Collapse, Form, Input, Typography } from 'antd'; import React from 'react'; import styled from 'styled-components'; -import { SourceBuilderState, StepProps } from './types'; +import { SourceBuilderState, StepProps, StringMapEntryInput } from './types'; const ControlsContainer = styled.div` display: flex; @@ -13,6 +13,10 @@ const SaveButton = styled(Button)` margin-right: 15px; `; +const ExtraEnvKey = 'extra_env_vars'; +const ExtraReqKey = 'extra_pip_requirements'; +const ExtraPluginKey = 'extra_pip_plugins'; + export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps) => { const setName = (stagedName: string) => { const newState: SourceBuilderState = { @@ -55,6 +59,90 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps) updateState(newState); }; + const retrieveExtraEnvs = () => { + const extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? state.config?.extraArgs : []; + const index: number = extraArgs.findIndex((entry) => entry.key === ExtraEnvKey) as number; + if (index > -1) { + return extraArgs[index].value; + } + return ''; + }; + + const setExtraEnvs = (envs: string) => { + let extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? state.config?.extraArgs : []; + const indxOfEnvVars: number = extraArgs.findIndex((entry) => entry.key === ExtraEnvKey) as number; + const value = { key: ExtraEnvKey, value: envs }; + if (indxOfEnvVars > -1) { + extraArgs[indxOfEnvVars] = value; + } else { + extraArgs = [...extraArgs, value]; + } + const newState: SourceBuilderState = { + ...state, + config: { + ...state.config, + extraArgs, + }, + }; + updateState(newState); + }; + + const retrieveExtraDataHubPlugins = () => { + const extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? state.config?.extraArgs : []; + const index: number = extraArgs.findIndex((entry) => entry.key === ExtraPluginKey) as number; + if (index > -1) { + return extraArgs[index].value; + } + return ''; + }; + + const setExtraDataHubPlugins = (plugins: string) => { + let extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? state.config?.extraArgs : []; + const indxOfPlugins: number = extraArgs.findIndex((entry) => entry.key === ExtraPluginKey) as number; + const value = { key: ExtraPluginKey, value: plugins }; + if (indxOfPlugins > -1) { + extraArgs[indxOfPlugins] = value; + } else { + extraArgs = [...extraArgs, value]; + } + const newState: SourceBuilderState = { + ...state, + config: { + ...state.config, + extraArgs, + }, + }; + updateState(newState); + }; + + const retrieveExtraReqs = () => { + const extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? state.config?.extraArgs : []; + const index: number = extraArgs.findIndex((entry) => entry.key === ExtraReqKey) as number; + if (index > -1) { + return extraArgs[index].value; + } + return ''; + }; + + const setExtraReqs = (reqs: string) => { + let extraArgs: StringMapEntryInput[] = state.config?.extraArgs ? 
state.config?.extraArgs : []; + const indxOfReqs: number = extraArgs.findIndex((entry) => entry.key === ExtraReqKey) as number; + const value = { key: ExtraReqKey, value: reqs }; + if (indxOfReqs > -1) { + extraArgs[indxOfReqs] = value; + } else { + extraArgs = [...extraArgs, value]; + } + const newState: SourceBuilderState = { + ...state, + config: { + ...state.config, + extraArgs, + }, + }; + updateState(newState); + }; + const onClickCreate = (shouldRun?: boolean) => { if (state.name !== undefined && state.name.length > 0) { submit(shouldRun); @@ -102,7 +190,7 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps) setVersion(event.target.value)} /> @@ -116,6 +204,39 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps) onChange={(event) => setDebugMode(event.target.checked)} /> + Extra Environment Variables}> + + Advanced: Set extra environment variables for an ingestion execution + + setExtraEnvs(event.target.value)} + /> + + Extra DataHub plugins}> + + Advanced: Set extra DataHub plugins for an ingestion execution + + setExtraDataHubPlugins(event.target.value)} + /> + + Extra Pip Libraries}> + + Advanced: Add extra pip libraries for an ingestion execution + + setExtraReqs(event.target.value)} + /> + @@ -123,6 +244,7 @@ export const NameSourceStep = ({ state, updateState, prev, submit }: StepProps)
0)} onClick={() => onClickCreate(false)} > diff --git a/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx b/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx index 4ddeb7b4925950..bee9b04cee1007 100644 --- a/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx +++ b/datahub-web-react/src/app/ingest/source/builder/RecipeBuilder.tsx @@ -86,10 +86,20 @@ function RecipeBuilder(props: Props) { {sourceConfigs?.displayName} Recipe - switchViews(true)}> + switchViews(true)} + data-testid="recipe-builder-form-button" + > Form - switchViews(false)}> + switchViews(false)} + data-testid="recipe-builder-yaml-button" + > YAML @@ -114,7 +124,9 @@ function RecipeBuilder(props: Props) { - + )} diff --git a/datahub-web-react/src/app/ingest/source/builder/sources.json b/datahub-web-react/src/app/ingest/source/builder/sources.json index 1bd5b6f1f768b5..b18384909c33f0 100644 --- a/datahub-web-react/src/app/ingest/source/builder/sources.json +++ b/datahub-web-react/src/app/ingest/source/builder/sources.json @@ -130,7 +130,7 @@ "name": "dynamodb", "displayName": "DynamoDB", "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/", - "recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # User could use the below option to provide a list of primary keys of a table in dynamodb format,\n # those items from given primary keys will be included when we scan the table.\n # For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items.\n # We'll enforce the the primary keys list size not to exceed 100\n # The total items we'll try to retrieve in these two scenarios:\n # 1. If user don't specify include_table_item: we'll retrieve up to 100 items\n # 2. 
If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in\n # the table, with a total not more than 200 items\n # include_table_item:\n # table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]" + "recipe": "source:\n type: dynamodb\n config:\n platform_instance: \"AWS_ACCOUNT_ID\"\n aws_access_key_id : '${AWS_ACCESS_KEY_ID}'\n aws_secret_access_key : '${AWS_SECRET_ACCESS_KEY}'\n # If there are items that have most representative fields of the table, users could use the\n # `include_table_item` option to provide a list of primary keys of the table in dynamodb format.\n # For each `region.table`, the list of primary keys can be at most 100.\n # We include these items in addition to the first 100 items in the table when we scan it.\n # include_table_item:\n # region.table_name:\n # [\n # {\n # 'partition_key_name': { 'attribute_type': 'attribute_value' },\n # 'sort_key_name': { 'attribute_type': 'attribute_value' },\n # },\n # ]" }, { "urn": "urn:li:dataPlatform:glue", @@ -223,4 +223,4 @@ "docsUrl": "https://datahubproject.io/docs/metadata-ingestion/", "recipe": "source:\n type: \n config:\n # Source-type specifics config\n " } -] \ No newline at end of file +] diff --git a/datahub-web-react/src/app/ingest/source/builder/types.ts b/datahub-web-react/src/app/ingest/source/builder/types.ts index cfe0f27ae7dbe3..2df467b7beba1f 100644 --- a/datahub-web-react/src/app/ingest/source/builder/types.ts +++ b/datahub-web-react/src/app/ingest/source/builder/types.ts @@ -34,6 +34,18 @@ export type StepProps = { ingestionSources: SourceConfig[]; }; +export type StringMapEntryInput = { + /** + * The key of the map entry + */ + key: string; + + /** + * The value of the map entry + */ + value: string; +}; + /** * The object represents the state of the Ingestion Source Builder form. */ @@ -91,5 +103,10 @@ export interface SourceBuilderState { * Advanced: Whether or not to run this ingestion source in debug mode */ debugMode?: boolean | null; + + /** + * Advanced: Extra arguments for the ingestion run.
+ */ + extraArgs?: StringMapEntryInput[] | null; }; } diff --git a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx index 849efabdcde97b..00fdc89964f88a 100644 --- a/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx +++ b/datahub-web-react/src/app/ingest/source/executions/ExecutionRequestDetailsModal.tsx @@ -2,6 +2,7 @@ import { DownloadOutlined } from '@ant-design/icons'; import { Button, message, Modal, Typography } from 'antd'; import React, { useEffect, useState } from 'react'; import styled from 'styled-components'; +import YAML from 'yamljs'; import { useGetIngestionExecutionRequestQuery } from '../../../../graphql/ingestion.generated'; import { ANTD_GRAY } from '../../../entity/shared/constants'; import { downloadFile } from '../../../search/utils/csvUtils'; @@ -65,6 +66,13 @@ const IngestedAssetsSection = styled.div` padding-right: 30px; `; +const RecipeSection = styled.div` + border-top: 1px solid ${ANTD_GRAY[4]}; + padding-top: 16px; + padding-left: 30px; + padding-right: 30px; +`; + const LogsSection = styled.div` padding-top: 16px; padding-left: 30px; @@ -91,6 +99,8 @@ type Props = { export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => { const [showExpandedLogs, setShowExpandedLogs] = useState(false); + const [showExpandedRecipe, setShowExpandedRecipe] = useState(false); + const { data, loading, error, refetch } = useGetIngestionExecutionRequestQuery({ variables: { urn } }); const output = data?.executionRequest?.result?.report || 'No output found.'; @@ -120,7 +130,18 @@ export const ExecutionDetailsModal = ({ urn, visible, onClose }: Props) => { const resultSummaryText = (result && {getExecutionRequestSummaryText(result)}) || undefined; - const isOutputExpandable = output.length > 100; + + const recipeJson = data?.executionRequest?.input.arguments?.find((arg) => arg.key === 'recipe')?.value; + let recipeYaml: string; + try { + recipeYaml = recipeJson && YAML.stringify(JSON.parse(recipeJson), 8, 2).trim(); + } catch (e) { + recipeYaml = ''; + } + const recipe = showExpandedRecipe ? recipeYaml : recipeYaml?.split('\n').slice(0, 1).join('\n'); + + const areLogsExpandable = output.length > 100; + const isRecipeExpandable = recipeYaml?.includes('\n'); return ( { -
{`${logs}${!showExpandedLogs && isOutputExpandable ? '...' : ''}`}
- {isOutputExpandable && ( +
{`${logs}${!showExpandedLogs && areLogsExpandable ? '...' : ''}`}
+ {areLogsExpandable && ( setShowExpandedLogs(!showExpandedLogs)}> {showExpandedLogs ? 'Hide' : 'Show More'} )}
+ {recipe && ( + + Recipe + + + The recipe used for this ingestion run. + + + +
{`${recipe}${!showExpandedRecipe && isRecipeExpandable ? '\n...' : ''}`}
+
+ {isRecipeExpandable && ( + setShowExpandedRecipe((v) => !v)}> + {showExpandedRecipe ? 'Hide' : 'Show More'} + + )} +
+ )}
); diff --git a/datahub-web-react/src/app/ingest/source/utils.ts b/datahub-web-react/src/app/ingest/source/utils.ts index c372388e958b78..f789ed8434721d 100644 --- a/datahub-web-react/src/app/ingest/source/utils.ts +++ b/datahub-web-react/src/app/ingest/source/utils.ts @@ -1,17 +1,19 @@ -import YAML from 'yamljs'; import { CheckCircleOutlined, ClockCircleOutlined, CloseCircleOutlined, + ExclamationCircleOutlined, LoadingOutlined, + StopOutlined, WarningOutlined, } from '@ant-design/icons'; -import { ANTD_GRAY, REDESIGN_COLORS } from '../../entity/shared/constants'; +import YAML from 'yamljs'; +import { ListIngestionSourcesDocument, ListIngestionSourcesQuery } from '../../../graphql/ingestion.generated'; import { EntityType, FacetMetadata } from '../../../types.generated'; -import { capitalizeFirstLetterOnly, pluralize } from '../../shared/textUtil'; import EntityRegistry from '../../entity/EntityRegistry'; +import { ANTD_GRAY, REDESIGN_COLORS } from '../../entity/shared/constants'; +import { capitalizeFirstLetterOnly, pluralize } from '../../shared/textUtil'; import { SourceConfig } from './builder/types'; -import { ListIngestionSourcesDocument, ListIngestionSourcesQuery } from '../../../graphql/ingestion.generated'; export const getSourceConfigs = (ingestionSources: SourceConfig[], sourceType: string) => { const sourceConfigs = ingestionSources.find((source) => source.name === sourceType); @@ -40,7 +42,9 @@ export function getPlaceholderRecipe(ingestionSources: SourceConfig[], type?: st export const RUNNING = 'RUNNING'; export const SUCCESS = 'SUCCESS'; +export const WARNING = 'WARNING'; export const FAILURE = 'FAILURE'; +export const CONNECTION_FAILURE = 'CONNECTION_FAILURE'; export const CANCELLED = 'CANCELLED'; export const UP_FOR_RETRY = 'UP_FOR_RETRY'; export const ROLLING_BACK = 'ROLLING_BACK'; @@ -56,8 +60,10 @@ export const getExecutionRequestStatusIcon = (status: string) => { return ( (status === RUNNING && LoadingOutlined) || (status === SUCCESS && CheckCircleOutlined) || + (status === WARNING && ExclamationCircleOutlined) || (status === FAILURE && CloseCircleOutlined) || - (status === CANCELLED && CloseCircleOutlined) || + (status === CONNECTION_FAILURE && CloseCircleOutlined) || + (status === CANCELLED && StopOutlined) || (status === UP_FOR_RETRY && ClockCircleOutlined) || (status === ROLLED_BACK && WarningOutlined) || (status === ROLLING_BACK && LoadingOutlined) || @@ -70,7 +76,9 @@ export const getExecutionRequestStatusDisplayText = (status: string) => { return ( (status === RUNNING && 'Running') || (status === SUCCESS && 'Succeeded') || + (status === WARNING && 'Completed') || (status === FAILURE && 'Failed') || + (status === CONNECTION_FAILURE && 'Connection Failed') || (status === CANCELLED && 'Cancelled') || (status === UP_FOR_RETRY && 'Up for Retry') || (status === ROLLED_BACK && 'Rolled Back') || @@ -83,21 +91,25 @@ export const getExecutionRequestStatusDisplayText = (status: string) => { export const getExecutionRequestSummaryText = (status: string) => { switch (status) { case RUNNING: - return 'Ingestion is running'; + return 'Ingestion is running...'; case SUCCESS: - return 'Ingestion successfully completed'; + return 'Ingestion succeeded with no errors or suspected missing data.'; + case WARNING: + return 'Ingestion completed with minor or intermittent errors.'; case FAILURE: - return 'Ingestion completed with errors'; + return 'Ingestion failed to complete, or completed with serious errors.'; + case CONNECTION_FAILURE: + return 'Ingestion failed due to network, 
authentication, or permission issues.'; case CANCELLED: - return 'Ingestion was cancelled'; + return 'Ingestion was cancelled.'; case ROLLED_BACK: - return 'Ingestion was rolled back'; + return 'Ingestion was rolled back.'; case ROLLING_BACK: - return 'Ingestion is in the process of rolling back'; + return 'Ingestion is in the process of rolling back.'; case ROLLBACK_FAILED: - return 'Ingestion rollback failed'; + return 'Ingestion rollback failed.'; default: - return 'Ingestion status not recognized'; + return 'Ingestion status not recognized.'; } }; @@ -105,7 +117,9 @@ export const getExecutionRequestStatusDisplayColor = (status: string) => { return ( (status === RUNNING && REDESIGN_COLORS.BLUE) || (status === SUCCESS && 'green') || + (status === WARNING && 'orangered') || (status === FAILURE && 'red') || + (status === CONNECTION_FAILURE && 'crimson') || (status === UP_FOR_RETRY && 'orange') || (status === CANCELLED && ANTD_GRAY[9]) || (status === ROLLED_BACK && 'orange') || diff --git a/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx b/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx index 68e91983babdbc..d3e01df3a66e84 100644 --- a/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx +++ b/datahub-web-react/src/app/permissions/policy/PolicyDetailsModal.tsx @@ -67,8 +67,8 @@ export default function PolicyDetailsModal({ policy, visible, onClose, privilege const isMetadataPolicy = policy?.type === PolicyType.Metadata; const resources = convertLegacyResourceFilter(policy?.resources); - const resourceTypes = getFieldValues(resources?.filter, 'RESOURCE_TYPE') || []; - const resourceEntities = getFieldValues(resources?.filter, 'RESOURCE_URN') || []; + const resourceTypes = getFieldValues(resources?.filter, 'TYPE') || []; + const resourceEntities = getFieldValues(resources?.filter, 'URN') || []; const domains = getFieldValues(resources?.filter, 'DOMAIN') || []; const { diff --git a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx index 1520388a5033a9..b8e1505fceaeca 100644 --- a/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx +++ b/datahub-web-react/src/app/permissions/policy/PolicyPrivilegeForm.tsx @@ -67,8 +67,8 @@ export default function PolicyPrivilegeForm({ } = useAppConfig(); const resources: ResourceFilter = convertLegacyResourceFilter(maybeResources) || EMPTY_POLICY.resources; - const resourceTypes = getFieldValues(resources.filter, 'RESOURCE_TYPE') || []; - const resourceEntities = getFieldValues(resources.filter, 'RESOURCE_URN') || []; + const resourceTypes = getFieldValues(resources.filter, 'TYPE') || []; + const resourceEntities = getFieldValues(resources.filter, 'URN') || []; const getDisplayName = (entity) => { if (!entity) { @@ -145,10 +145,7 @@ export default function PolicyPrivilegeForm({ }; setResources({ ...resources, - filter: setFieldValues(filter, 'RESOURCE_TYPE', [ - ...resourceTypes, - createCriterionValue(selectedResourceType), - ]), + filter: setFieldValues(filter, 'TYPE', [...resourceTypes, createCriterionValue(selectedResourceType)]), }); }; @@ -160,7 +157,7 @@ export default function PolicyPrivilegeForm({ ...resources, filter: setFieldValues( filter, - 'RESOURCE_TYPE', + 'TYPE', resourceTypes?.filter((criterionValue) => criterionValue.value !== deselectedResourceType), ), }); @@ -173,7 +170,7 @@ export default function PolicyPrivilegeForm({ }; setResources({ ...resources, - filter: 
setFieldValues(filter, 'RESOURCE_URN', [ + filter: setFieldValues(filter, 'URN', [ ...resourceEntities, createCriterionValueWithEntity( resource, @@ -192,7 +189,7 @@ export default function PolicyPrivilegeForm({ ...resources, filter: setFieldValues( filter, - 'RESOURCE_URN', + 'URN', resourceEntities?.filter((criterionValue) => criterionValue.value !== resource), ), }); diff --git a/datahub-web-react/src/app/permissions/policy/policyUtils.ts b/datahub-web-react/src/app/permissions/policy/policyUtils.ts index c7af7342f6efa5..2f178fcdeb5c34 100644 --- a/datahub-web-react/src/app/permissions/policy/policyUtils.ts +++ b/datahub-web-react/src/app/permissions/policy/policyUtils.ts @@ -99,10 +99,10 @@ export const convertLegacyResourceFilter = (resourceFilter: Maybe(); if (resourceFilter.type) { - criteria.push(createCriterion('RESOURCE_TYPE', [createCriterionValue(resourceFilter.type)])); + criteria.push(createCriterion('TYPE', [createCriterionValue(resourceFilter.type)])); } if (resourceFilter.resources && resourceFilter.resources.length > 0) { - criteria.push(createCriterion('RESOURCE_URN', resourceFilter.resources.map(createCriterionValue))); + criteria.push(createCriterion('URN', resourceFilter.resources.map(createCriterionValue))); } return { filter: { diff --git a/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx b/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx index d5722429aaf6b3..2bb76714d6119d 100644 --- a/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx +++ b/datahub-web-react/src/app/preview/EntityPaths/EntityPathsModal.tsx @@ -39,6 +39,7 @@ export default function EntityPathsModal({ paths, resultEntityUrn, hideModal }: return ( Column path{paths.length > 1 && 's'} from{' '} diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx index fb10e1ca0026eb..5f797e68fe0e8b 100644 --- a/datahub-web-react/src/app/search/SearchBar.tsx +++ b/datahub-web-react/src/app/search/SearchBar.tsx @@ -6,7 +6,7 @@ import { useHistory } from 'react-router'; import { AutoCompleteResultForEntity, EntityType, FacetFilterInput, ScenarioType } from '../../types.generated'; import EntityRegistry from '../entity/EntityRegistry'; import filterSearchQuery from './utils/filterSearchQuery'; -import { ANTD_GRAY, ANTD_GRAY_V2 } from '../entity/shared/constants'; +import { ANTD_GRAY, ANTD_GRAY_V2, REDESIGN_COLORS } from '../entity/shared/constants'; import { getEntityPath } from '../entity/shared/containers/profile/utils'; import { EXACT_SEARCH_PREFIX } from './utils/constants'; import { useListRecommendationsQuery } from '../../graphql/recommendations.generated'; @@ -20,7 +20,6 @@ import RecommendedOption from './autoComplete/RecommendedOption'; import SectionHeader, { EntityTypeLabel } from './autoComplete/SectionHeader'; import { useUserContext } from '../context/useUserContext'; import { navigateToSearchUrl } from './utils/navigateToSearchUrl'; -import { getQuickFilterDetails } from './autoComplete/quickFilters/utils'; import ViewAllSearchItem from './ViewAllSearchItem'; import { ViewSelect } from '../entity/view/select/ViewSelect'; import { combineSiblingsInAutoComplete } from './utils/combineSiblingsInAutoComplete'; @@ -39,13 +38,14 @@ const StyledSearchBar = styled(Input)` &&& { border-radius: 70px; height: 40px; - font-size: 20px; - color: ${ANTD_GRAY[7]}; - background-color: ${ANTD_GRAY_V2[2]}; - } - > .ant-input { font-size: 14px; + color: ${ANTD_GRAY[7]}; background-color: ${ANTD_GRAY_V2[2]}; + border: 
2px solid transparent; + + &:focus-within { + border: 1.5px solid ${REDESIGN_COLORS.BLUE}; + } } > .ant-input::placeholder { color: ${ANTD_GRAY_V2[10]}; @@ -119,6 +119,7 @@ interface Props { setIsSearchBarFocused?: (isSearchBarFocused: boolean) => void; onFocus?: () => void; onBlur?: () => void; + showViewAllResults?: boolean; } const defaultProps = { @@ -146,6 +147,7 @@ export const SearchBar = ({ setIsSearchBarFocused, onFocus, onBlur, + showViewAllResults = false, }: Props) => { const history = useHistory(); const [searchQuery, setSearchQuery] = useState(initialQuery); @@ -203,23 +205,16 @@ export const SearchBar = ({ const { quickFilters, selectedQuickFilter, setSelectedQuickFilter } = useQuickFiltersContext(); const autoCompleteQueryOptions = useMemo(() => { - const query = suggestions.length ? effectiveQuery : ''; - const selectedQuickFilterLabel = - showQuickFilters && selectedQuickFilter - ? getQuickFilterDetails(selectedQuickFilter, entityRegistry).label - : ''; - const text = query || selectedQuickFilterLabel; - - if (!text) return []; + if (effectiveQuery === '' || !showViewAllResults) return []; return [ { - value: `${EXACT_SEARCH_PREFIX}${text}`, - label: , + value: `${EXACT_SEARCH_PREFIX}${effectiveQuery}`, + label: , type: EXACT_AUTOCOMPLETE_OPTION_TYPE, }, ]; - }, [showQuickFilters, suggestions.length, effectiveQuery, selectedQuickFilter, entityRegistry]); + }, [effectiveQuery, showViewAllResults]); const autoCompleteEntityOptions = useMemo(() => { return suggestions.map((suggestion: AutoCompleteResultForEntity) => { @@ -296,6 +291,22 @@ export const SearchBar = ({ } } + const searchInputRef = useRef(null); + + useEffect(() => { + const handleKeyDown = (event) => { + // Support command-k to select the search bar. + // 75 is the keyCode for 'k' + if ((event.metaKey || event.ctrlKey) && event.keyCode === 75) { + (searchInputRef?.current as any)?.focus(); + } + }; + document.addEventListener('keydown', handleKeyDown); + return () => { + document.removeEventListener('keydown', handleKeyDown); + }; + }, []); + return ( } + ref={searchInputRef} /> diff --git a/datahub-web-react/src/app/search/SearchHeader.tsx b/datahub-web-react/src/app/search/SearchHeader.tsx index 74bc562e275d11..91f9753a3d6012 100644 --- a/datahub-web-react/src/app/search/SearchHeader.tsx +++ b/datahub-web-react/src/app/search/SearchHeader.tsx @@ -107,6 +107,7 @@ export const SearchHeader = ({ combineSiblings fixAutoComplete showQuickFilters + showViewAllResults /> diff --git a/datahub-web-react/src/app/search/useGetSearchQueryInputs.ts b/datahub-web-react/src/app/search/useGetSearchQueryInputs.ts index 05419e5abed35b..9a3af8fb8d56c7 100644 --- a/datahub-web-react/src/app/search/useGetSearchQueryInputs.ts +++ b/datahub-web-react/src/app/search/useGetSearchQueryInputs.ts @@ -3,7 +3,7 @@ import { useLocation, useParams } from 'react-router'; import { useMemo } from 'react'; import { FacetFilterInput, EntityType } from '../../types.generated'; import { useEntityRegistry } from '../useEntityRegistry'; -import { ENTITY_FILTER_NAME, FILTER_DELIMITER, UnionType } from './utils/constants'; +import { ENTITY_FILTER_NAME, UnionType } from './utils/constants'; import { useUserContext } from '../context/useUserContext'; import useFilters from './utils/useFilters'; import { generateOrFilters } from './utils/generateOrFilters'; @@ -27,12 +27,6 @@ export default function useGetSearchQueryInputs(excludedFilterFields?: Array = useFilters(params); - const nonNestedFilters = filters.filter( - (f) => 
!f.field.includes(FILTER_DELIMITER) && !excludedFilterFields?.includes(f.field), - ); - const nestedFilters = filters.filter( - (f) => f.field.includes(FILTER_DELIMITER) && !excludedFilterFields?.includes(f.field), - ); const entityFilters: Array = useMemo( () => filters @@ -43,8 +37,8 @@ export default function useGetSearchQueryInputs(excludedFilterFields?: Array generateOrFilters(unionType, nonNestedFilters, nestedFilters), - [nonNestedFilters, nestedFilters, unionType], + () => generateOrFilters(unionType, filters, excludedFilterFields), + [filters, excludedFilterFields, unionType], ); return { entityFilters, query, unionType, filters, orFilters, viewUrn, page, activeType, sortInput }; diff --git a/datahub-web-react/src/app/search/utils/__tests__/generateOrFilters.test.ts b/datahub-web-react/src/app/search/utils/__tests__/generateOrFilters.test.ts index 505c50efb289fa..fd5a5691b454ef 100644 --- a/datahub-web-react/src/app/search/utils/__tests__/generateOrFilters.test.ts +++ b/datahub-web-react/src/app/search/utils/__tests__/generateOrFilters.test.ts @@ -1,7 +1,7 @@ import { DOMAINS_FILTER_NAME, ENTITY_SUB_TYPE_FILTER_NAME, - ENTITY_TYPE_FILTER_NAME, + ENTITY_FILTER_NAME, TAGS_FILTER_NAME, UnionType, } from '../constants'; @@ -10,7 +10,7 @@ import { generateOrFilters } from '../generateOrFilters'; describe('generateOrFilters', () => { it('should generate orFilters with UnionType.AND', () => { const filters = [ - { field: ENTITY_TYPE_FILTER_NAME, values: ['DATASET', 'CONTAINER'] }, + { field: ENTITY_FILTER_NAME, values: ['DATASET', 'CONTAINER'] }, { field: TAGS_FILTER_NAME, values: ['urn:li:tag:tag1'] }, ]; const orFilters = generateOrFilters(UnionType.AND, filters); @@ -24,7 +24,7 @@ describe('generateOrFilters', () => { it('should generate orFilters with UnionType.OR', () => { const filters = [ - { field: ENTITY_TYPE_FILTER_NAME, values: ['DATASET', 'CONTAINER'] }, + { field: ENTITY_FILTER_NAME, values: ['DATASET', 'CONTAINER'] }, { field: TAGS_FILTER_NAME, values: ['urn:li:tag:tag1'] }, ]; const orFilters = generateOrFilters(UnionType.OR, filters); @@ -43,17 +43,23 @@ describe('generateOrFilters', () => { const filters = [ { field: TAGS_FILTER_NAME, values: ['urn:li:tag:tag1'] }, { field: DOMAINS_FILTER_NAME, values: ['urn:li:domains:domain1'] }, + { field: ENTITY_SUB_TYPE_FILTER_NAME, values: ['CONTAINER', 'DATASET␞table'] }, ]; - const nestedFilters = [{ field: ENTITY_SUB_TYPE_FILTER_NAME, values: ['CONTAINER', 'DATASET␞table'] }]; - const orFilters = generateOrFilters(UnionType.AND, filters, nestedFilters); + // const nestedFilters = [{ field: ENTITY_SUB_TYPE_FILTER_NAME, values: ['CONTAINER', 'DATASET␞table'] }]; + const orFilters = generateOrFilters(UnionType.AND, filters); expect(orFilters).toMatchObject([ { - and: [...filters, { field: '_entityType', values: ['CONTAINER'] }], + and: [ + { field: TAGS_FILTER_NAME, values: ['urn:li:tag:tag1'] }, + { field: DOMAINS_FILTER_NAME, values: ['urn:li:domains:domain1'] }, + { field: '_entityType', values: ['CONTAINER'] }, + ], }, { and: [ - ...filters, + { field: TAGS_FILTER_NAME, values: ['urn:li:tag:tag1'] }, + { field: DOMAINS_FILTER_NAME, values: ['urn:li:domains:domain1'] }, { field: '_entityType', values: ['DATASET'] }, { field: 'typeNames', values: ['table'] }, ], @@ -65,9 +71,9 @@ describe('generateOrFilters', () => { const filters = [ { field: TAGS_FILTER_NAME, values: ['urn:li:tag:tag1'] }, { field: DOMAINS_FILTER_NAME, values: ['urn:li:domains:domain1'] }, + { field: ENTITY_SUB_TYPE_FILTER_NAME, values: ['CONTAINER', 
'DATASET␞table'] }, ]; - const nestedFilters = [{ field: ENTITY_SUB_TYPE_FILTER_NAME, values: ['CONTAINER', 'DATASET␞table'] }]; - const orFilters = generateOrFilters(UnionType.OR, filters, nestedFilters); + const orFilters = generateOrFilters(UnionType.OR, filters); expect(orFilters).toMatchObject([ { @@ -87,4 +93,18 @@ describe('generateOrFilters', () => { }, ]); }); + + it('should generate orFilters and exclude filters with a provided exclude field', () => { + const filters = [ + { field: ENTITY_FILTER_NAME, values: ['DATASET', 'CONTAINER'] }, + { field: TAGS_FILTER_NAME, values: ['urn:li:tag:tag1'] }, + ]; + const orFilters = generateOrFilters(UnionType.AND, filters, [ENTITY_FILTER_NAME]); + + expect(orFilters).toMatchObject([ + { + and: [{ field: TAGS_FILTER_NAME, values: ['urn:li:tag:tag1'] }], + }, + ]); + }); }); diff --git a/datahub-web-react/src/app/search/utils/generateOrFilters.ts b/datahub-web-react/src/app/search/utils/generateOrFilters.ts index b665a2e0f0495b..fa2939b3436f5f 100644 --- a/datahub-web-react/src/app/search/utils/generateOrFilters.ts +++ b/datahub-web-react/src/app/search/utils/generateOrFilters.ts @@ -26,20 +26,26 @@ function generateInputWithNestedFilters(filters: FacetFilterInput[], nestedFilte export function generateOrFilters( unionType: UnionType, filters: FacetFilterInput[], - nestedFilters: FacetFilterInput[] = [], + excludedFilterFields: string[] = [], ): AndFilterInput[] { - if ((filters?.length || 0) === 0 && nestedFilters.length === 0) { + if ((filters?.length || 0) === 0) { return []; } + const nonNestedFilters = filters.filter( + (f) => !f.field.includes(FILTER_DELIMITER) && !excludedFilterFields?.includes(f.field), + ); + const nestedFilters = filters.filter( + (f) => f.field.includes(FILTER_DELIMITER) && !excludedFilterFields?.includes(f.field), + ); if (unionType === UnionType.OR) { const orFiltersWithNestedFilters = generateInputWithNestedFilters([], nestedFilters); - const orFilters = filters.map((filter) => ({ + const orFilters = nonNestedFilters.map((filter) => ({ and: [filter], })); return [...orFilters, ...orFiltersWithNestedFilters]; } - const andFiltersWithNestedFilters = generateInputWithNestedFilters(filters, nestedFilters); + const andFiltersWithNestedFilters = generateInputWithNestedFilters(nonNestedFilters, nestedFilters); if (andFiltersWithNestedFilters.length) { return andFiltersWithNestedFilters; @@ -47,7 +53,7 @@ export function generateOrFilters( return [ { - and: filters, + and: nonNestedFilters, }, ]; } diff --git a/datahub-web-react/src/app/settings/AccessTokenModal.tsx b/datahub-web-react/src/app/settings/AccessTokenModal.tsx index 0303db656c2a82..10427210d06928 100644 --- a/datahub-web-react/src/app/settings/AccessTokenModal.tsx +++ b/datahub-web-react/src/app/settings/AccessTokenModal.tsx @@ -60,7 +60,7 @@ export const AccessTokenModal = ({ visible, onClose, accessToken, expiresInText onCancel={onClose} footer={ <> - @@ -81,7 +81,7 @@ export const AccessTokenModal = ({ visible, onClose, accessToken, expiresInText Token {expiresInText} -
{accessToken}
+
{accessToken}
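For the token display touched in the hunk above, a minimal sketch of rendering an access token with a built-in copy-to-clipboard affordance via Ant Design's Typography.Paragraph (TokenDisplay is a hypothetical wrapper for illustration, not a component from this change; only the antd copyable prop is assumed):

    import React from 'react';
    import { Typography } from 'antd';

    // Hypothetical wrapper; the real modal also wires up visibility state and onClose.
    export function TokenDisplay({ accessToken }: { accessToken: string }) {
        // `copyable` renders a copy-to-clipboard button next to the text.
        return <Typography.Paragraph copyable>{accessToken}</Typography.Paragraph>;
    }
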
diff --git a/datahub-web-react/src/app/settings/AccessTokens.tsx b/datahub-web-react/src/app/settings/AccessTokens.tsx index 02ff3f1cd304c9..c7a015de392da3 100644 --- a/datahub-web-react/src/app/settings/AccessTokens.tsx +++ b/datahub-web-react/src/app/settings/AccessTokens.tsx @@ -199,7 +199,12 @@ export const AccessTokens = () => { key: 'x', render: (_, record: any) => ( - diff --git a/datahub-web-react/src/app/settings/CreateTokenModal.tsx b/datahub-web-react/src/app/settings/CreateTokenModal.tsx index 6038a86e233035..3cc446651efcbc 100644 --- a/datahub-web-react/src/app/settings/CreateTokenModal.tsx +++ b/datahub-web-react/src/app/settings/CreateTokenModal.tsx @@ -117,10 +117,15 @@ export default function CreateTokenModal({ currentUserUrn, visible, onClose, onC onCancel={onModalClose} footer={ <> - - @@ -148,18 +153,21 @@ export default function CreateTokenModal({ currentUserUrn, visible, onClose, onC ]} hasFeedback > - + Description}> An optional description for your new token. - + Expires in - + {ACCESS_TOKEN_DURATIONS.map((duration) => ( diff --git a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx index ced7d8642576b3..ce1ad93565ba43 100644 --- a/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx +++ b/datahub-web-react/src/app/shared/admin/HeaderLinks.tsx @@ -93,20 +93,6 @@ export function HeaderLinks(props: Props) { )} - {showIngestion && ( - - - - - - )} + {showIngestion && ( + + + + + + )} {showSettings && ( diff --git a/datahub-web-react/src/conf/theme/theme_dark.config.json b/datahub-web-react/src/conf/theme/theme_dark.config.json index 9746c3ddde5f37..54ebebd3b692b3 100644 --- a/datahub-web-react/src/conf/theme/theme_dark.config.json +++ b/datahub-web-react/src/conf/theme/theme_dark.config.json @@ -30,7 +30,7 @@ "homepageMessage": "Find data you can count(*) on" }, "search": { - "searchbarMessage": "Search Datasets, People, & more..." + "searchbarMessage": "Search Tables, Dashboards, People, & more..." }, "menu": { "items": [ @@ -52,4 +52,4 @@ ] } } -} +} \ No newline at end of file diff --git a/datahub-web-react/src/conf/theme/theme_light.config.json b/datahub-web-react/src/conf/theme/theme_light.config.json index 906c04e38a1baf..6b9ef3eac52b0f 100644 --- a/datahub-web-react/src/conf/theme/theme_light.config.json +++ b/datahub-web-react/src/conf/theme/theme_light.config.json @@ -33,7 +33,7 @@ "homepageMessage": "Find data you can count on" }, "search": { - "searchbarMessage": "Search Datasets, People, & more..." + "searchbarMessage": "Search Tables, Dashboards, People, & more..." }, "menu": { "items": [ @@ -60,4 +60,4 @@ ] } } -} +} \ No newline at end of file diff --git a/datahub-web-react/src/graphql/group.graphql b/datahub-web-react/src/graphql/group.graphql index 9aa6e2b005f16c..1007721e51a4ec 100644 --- a/datahub-web-react/src/graphql/group.graphql +++ b/datahub-web-react/src/graphql/group.graphql @@ -3,6 +3,7 @@ query getGroup($urn: String!, $membersCount: Int!) { urn type name + exists origin { type externalType diff --git a/datahub-web-react/src/graphql/ingestion.graphql b/datahub-web-react/src/graphql/ingestion.graphql index 80f66642fe11f8..1767fe34bfef08 100644 --- a/datahub-web-react/src/graphql/ingestion.graphql +++ b/datahub-web-react/src/graphql/ingestion.graphql @@ -12,6 +12,10 @@ query listIngestionSources($input: ListIngestionSourcesInput!) 
{ version executorId debugMode + extraArgs { + key + value + } } schedule { interval @@ -51,6 +55,10 @@ query getIngestionSource($urn: String!, $runStart: Int, $runCount: Int) { version executorId debugMode + extraArgs { + key + value + } } schedule { interval @@ -90,6 +98,10 @@ query getIngestionExecutionRequest($urn: String!) { source { type } + arguments { + key + value + } } result { status diff --git a/datahub-web-react/src/graphql/scroll.graphql b/datahub-web-react/src/graphql/scroll.graphql index 18274c50c2166a..1031fed7b9e136 100644 --- a/datahub-web-react/src/graphql/scroll.graphql +++ b/datahub-web-react/src/graphql/scroll.graphql @@ -408,6 +408,7 @@ fragment downloadScrollAcrossLineageResult on ScrollAcrossLineageResults { count total searchResults { + degree entity { ...downloadSearchResults } diff --git a/datahub-web-react/src/images/verticalogo.png b/datahub-web-react/src/images/verticalogo.png index a81047fd43edbf..5da38f4e67c7d4 100644 Binary files a/datahub-web-react/src/images/verticalogo.png and b/datahub-web-react/src/images/verticalogo.png differ diff --git a/docker/airflow/local_airflow.md b/docker/airflow/local_airflow.md index 55a64f5c122c51..fbfc1d17327c53 100644 --- a/docker/airflow/local_airflow.md +++ b/docker/airflow/local_airflow.md @@ -1,6 +1,6 @@ :::caution -This feature is currently unmaintained. As of 0.10.0 the container described is not published alongside the DataHub CLI. If you'd like to use it, please reach out to us on the [community slack.](docs/slack.md) +This guide is currently unmaintained. As of 0.10.0 the container described is not published alongside the DataHub CLI. If you'd like to use it, please reach out to us on the [community slack.](docs/slack.md) ::: diff --git a/docker/broker/env/docker.env b/docker/broker/env/docker.env index 18115697c2832f..6eb958609daf12 100644 --- a/docker/broker/env/docker.env +++ b/docker/broker/env/docker.env @@ -5,4 +5,6 @@ KAFKA_ADVERTISED_LISTENERS=PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9 KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR=1 KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0 KAFKA_HEAP_OPTS=-Xms256m -Xmx256m -KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE=false \ No newline at end of file +KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE=false +KAFKA_MESSAGE_MAX_BYTES=5242880 +KAFKA_MAX_MESSAGE_BYTES=5242880 \ No newline at end of file diff --git a/docker/build.gradle b/docker/build.gradle index 0faea626e982d9..56634a5fe0c675 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -35,7 +35,7 @@ task quickstart(type: Exec, dependsOn: ':metadata-ingestion:install') { environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" - // environment "ACTIONS_VERSION", 'alpine3.17-slim' + // environment "ACTIONS_VERSION", 'alpine3.18-slim' // environment "DATAHUB_ACTIONS_IMAGE", 'nginx' // Elastic @@ -97,10 +97,20 @@ task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') { dependsOn(debug_modules.collect { it + ':dockerTagDebug' }) shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' - environment "DATAHUB_PRECREATE_TOPICS", "true" environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" + // Elastic + // environment "DATAHUB_SEARCH_IMAGE", 'elasticsearch' + // environment "DATAHUB_SEARCH_TAG", '7.10.1' + + // OpenSearch + environment "DATAHUB_SEARCH_IMAGE", 'opensearchproject/opensearch' + environment "DATAHUB_SEARCH_TAG", '2.9.0' + environment "XPACK_SECURITY_ENABLED", 
'plugins.security.disabled=true' + environment "USE_AWS_ELASTICSEARCH", 'true' + + def cmd = [ 'source ../metadata-ingestion/venv/bin/activate && ', 'datahub docker quickstart', diff --git a/docker/datahub-frontend/Dockerfile b/docker/datahub-frontend/Dockerfile index 9efc0d2ce8753e..9c13e730780421 100644 --- a/docker/datahub-frontend/Dockerfile +++ b/docker/datahub-frontend/Dockerfile @@ -8,10 +8,12 @@ RUN addgroup -S datahub && adduser -S datahub -G datahub # Upgrade Alpine and base packages # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ - && apk --no-cache add curl sqlite \ + && apk --no-cache add curl sqlite libc6-compat java-snappy \ && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && apk --no-cache add jattach --repository http://dl-cdn.alpinelinux.org/alpine/edge/community/ +ENV LD_LIBRARY_PATH="/lib:/lib64" + FROM base as prod-install COPY ./datahub-frontend.zip / diff --git a/docker/datahub-gms/Dockerfile b/docker/datahub-gms/Dockerfile index 2d74a288b8c995..e271188a703ccf 100644 --- a/docker/datahub-gms/Dockerfile +++ b/docker/datahub-gms/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder @@ -18,7 +18,7 @@ FROM alpine:3 AS base ENV JMX_VERSION=0.18.0 # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ - && apk --no-cache add curl bash coreutils gcompat sqlite \ + && apk --no-cache add curl bash coreutils gcompat sqlite libc6-compat java-snappy \ && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && apk --no-cache add jattach --repository http://dl-cdn.alpinelinux.org/alpine/edge/community/ \ && curl -sS https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-runner/9.4.46.v20220331/jetty-runner-9.4.46.v20220331.jar --output jetty-runner.jar \ @@ -29,6 +29,8 @@ RUN apk --no-cache --update-cache --available upgrade \ && cp /usr/lib/jvm/java-11-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks COPY --from=binary /go/bin/dockerize /usr/local/bin +ENV LD_LIBRARY_PATH="/lib:/lib64" + FROM base as prod-install COPY war.war /datahub/datahub-gms/bin/war.war COPY metadata-models/src/main/resources/entity-registry.yml /datahub/datahub-gms/resources/entity-registry.yml diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 564cc19cc9a5f9..25afe9b8b3dce0 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -1,7 +1,7 @@ ARG APP_ENV=full ARG BASE_IMAGE=base -FROM golang:1-alpine3.17 AS dockerize-binary +FROM golang:1-alpine3.18 AS dockerize-binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt index 82d9a93a9a2c3c..eb082d50b3020f 100644 --- a/docker/datahub-ingestion-base/base-requirements.txt +++ b/docker/datahub-ingestion-base/base-requirements.txt @@ -2,62 +2,58 @@ # pyspark==3.0.3 # pydeequ==1.0.1 -acryl-datahub-classify==0.0.6 -acryl-iceberg-legacy==0.0.4 -acryl-PyHive==0.6.13 -aenum==3.1.12 -aiohttp==3.8.4 +acryl-datahub-classify==0.0.8 +acryl-PyHive==0.6.14 
+acryl-sqlglot==18.5.2.dev45 +aenum==3.1.15 +aiohttp==3.8.6 aiosignal==1.3.1 -alembic==1.11.1 +alembic==1.12.0 altair==4.2.0 -anyio==3.7.0 -apache-airflow==2.6.1 -apache-airflow-providers-common-sql==1.5.1 -apache-airflow-providers-ftp==3.4.1 -apache-airflow-providers-http==4.4.1 -apache-airflow-providers-imap==3.2.1 -apache-airflow-providers-sqlite==3.4.1 -apispec==5.2.2 +anyio==3.7.1 +apache-airflow==2.7.2 +apache-airflow-providers-common-sql==1.7.2 +apache-airflow-providers-ftp==3.5.2 +apache-airflow-providers-http==4.5.2 +apache-airflow-providers-imap==3.3.2 +apache-airflow-providers-sqlite==3.4.3 +apispec==6.3.0 appdirs==1.4.4 appnope==0.1.3 -argcomplete==3.0.8 -argon2-cffi==21.3.0 +argcomplete==3.1.2 +argon2-cffi==23.1.0 argon2-cffi-bindings==21.2.0 asgiref==3.7.2 asn1crypto==1.5.1 -asttokens==2.2.1 -async-timeout==4.0.2 +asttokens==2.4.0 +async-timeout==4.0.3 asynch==0.2.2 attrs==23.1.0 avro==1.10.2 -avro-gen3==0.7.10 -azure-core==1.26.4 -azure-identity==1.10.0 -azure-storage-blob==12.16.0 -azure-storage-file-datalake==12.11.0 -Babel==2.12.1 +avro-gen3==0.7.11 +Babel==2.13.0 backcall==0.2.0 backoff==2.2.1 beautifulsoup4==4.12.2 -bleach==6.0.0 -blinker==1.6.2 -blis==0.7.9 -boto3==1.26.142 -botocore==1.29.142 +bleach==6.1.0 +blinker==1.6.3 +blis==0.7.11 +boto3==1.28.62 +botocore==1.31.62 bowler==0.9.0 -bracex==2.3.post1 +bracex==2.4 cached-property==1.5.2 cachelib==0.9.0 cachetools==5.3.1 -catalogue==2.0.8 -cattrs==22.2.0 -certifi==2023.5.7 -cffi==1.15.1 -chardet==5.1.0 -charset-normalizer==2.1.1 +catalogue==2.0.10 +cattrs==23.1.2 +certifi==2023.7.22 +cffi==1.16.0 +chardet==5.2.0 +charset-normalizer==3.3.0 ciso8601==2.3.0 -click==8.1.3 -click-default-group==1.2.2 +click==8.1.7 +click-default-group==1.2.4 click-spinner==0.1.10 clickclick==20.10.2 clickhouse-cityhash==1.0.2.4 @@ -66,205 +62,217 @@ clickhouse-sqlalchemy==0.2.4 cloudpickle==2.2.1 colorama==0.4.6 colorlog==4.8.0 -confection==0.0.4 +comm==0.1.4 +confection==0.1.3 ConfigUpdater==3.1.1 confluent-kafka==1.8.2 connexion==2.14.2 cron-descriptor==1.4.0 -croniter==1.3.15 -cryptography==37.0.4 +croniter==2.0.1 +cryptography==41.0.4 cx-Oracle==8.3.0 -cymem==2.0.7 -dask==2023.5.1 -databricks-cli==0.17.7 +cymem==2.0.8 +dask==2023.9.3 +databricks-cli==0.18.0 databricks-dbapi==0.6.0 -databricks-sdk==0.1.8 -debugpy==1.6.7 +databricks-sdk==0.10.0 +debugpy==1.8.0 decorator==5.1.1 defusedxml==0.7.1 -deltalake==0.9.0 +deltalake==0.11.0 Deprecated==1.2.14 -dill==0.3.6 -dnspython==2.3.0 -docker==6.1.2 +dill==0.3.7 +dnspython==2.4.2 +docker==6.1.3 docutils==0.20.1 ecdsa==0.18.0 elasticsearch==7.13.4 email-validator==1.3.1 entrypoints==0.4 et-xmlfile==1.1.0 -exceptiongroup==1.1.1 -executing==1.2.0 -expandvars==0.9.0 -fastapi==0.95.2 -fastavro==1.7.4 -fastjsonschema==2.17.1 -feast==0.29.0 -filelock==3.12.0 +exceptiongroup==1.1.3 +executing==2.0.0 +expandvars==0.11.0 +fastapi==0.103.2 +fastavro==1.8.4 +fastjsonschema==2.18.1 +feast==0.31.1 +filelock==3.12.4 fissix==21.11.13 Flask==2.2.5 flatdict==4.0.1 -frozenlist==1.3.3 -fsspec==2023.5.0 +frozenlist==1.4.0 +fsspec==2023.9.2 future==0.18.3 -GeoAlchemy2==0.13.3 +GeoAlchemy2==0.14.1 gitdb==4.0.10 -GitPython==3.1.31 -google-api-core==2.11.0 -google-auth==2.19.0 -google-cloud-appengine-logging==1.3.0 +GitPython==3.1.37 +google-api-core==2.12.0 +google-auth==2.23.3 +google-cloud-appengine-logging==1.3.2 google-cloud-audit-log==0.2.5 -google-cloud-bigquery==3.10.0 -google-cloud-bigquery-storage==2.19.1 -google-cloud-core==2.3.2 +google-cloud-bigquery==3.12.0 +google-cloud-core==2.3.3 
google-cloud-datacatalog-lineage==0.2.2 google-cloud-logging==3.5.0 google-crc32c==1.5.0 -google-resumable-media==2.5.0 -googleapis-common-protos==1.59.0 +google-re2==1.1 +google-resumable-media==2.6.0 +googleapis-common-protos==1.60.0 gql==3.4.1 graphql-core==3.2.3 graphviz==0.20.1 great-expectations==0.15.50 -greenlet==2.0.2 +greenlet==3.0.0 grpc-google-iam-v1==0.12.6 -grpcio==1.54.2 -grpcio-reflection==1.54.2 -grpcio-status==1.54.2 -grpcio-tools==1.54.2 -gssapi==1.8.2 -gunicorn==20.1.0 +grpcio==1.59.0 +grpcio-reflection==1.59.0 +grpcio-status==1.59.0 +grpcio-tools==1.59.0 +gssapi==1.8.3 +gunicorn==21.2.0 h11==0.14.0 -hmsclient==0.1.1 -httpcore==0.17.2 -httptools==0.5.0 -httpx==0.24.1 +httpcore==0.18.0 +httptools==0.6.0 +httpx==0.25.0 humanfriendly==10.0 idna==3.4 -ijson==3.2.0.post0 -importlib-metadata==6.6.0 -importlib-resources==5.12.0 +ijson==3.2.3 +importlib-metadata==6.8.0 +importlib-resources==6.1.0 inflection==0.5.1 ipaddress==1.0.23 ipykernel==6.17.1 -ipython==8.13.2 +ipython==8.16.1 ipython-genutils==0.2.0 -ipywidgets==8.0.6 +ipywidgets==8.1.1 iso3166==2.1.1 isodate==0.6.1 itsdangerous==2.1.2 -jedi==0.18.2 +jedi==0.19.1 Jinja2==3.1.2 jmespath==1.0.1 JPype1==1.4.1 -jsonlines==3.1.0 -jsonpatch==1.32 -jsonpointer==2.3 +jsonlines==4.0.0 +jsonpatch==1.33 +jsonpointer==2.4 jsonref==1.1.0 -jsonschema==4.17.3 +jsonschema==4.19.1 +jsonschema-specifications==2023.7.1 jupyter-server==1.24.0 jupyter_client==7.4.9 jupyter_core==4.12.0 jupyterlab-pygments==0.2.2 -jupyterlab-widgets==3.0.7 +jupyterlab-widgets==3.0.9 langcodes==3.3.0 lark==1.1.4 lazy-object-proxy==1.9.0 leb128==1.0.5 -limits==3.5.0 +limits==3.6.0 linear-tsv==1.1.0 linkify-it-py==2.0.2 lkml==1.3.1 locket==1.0.0 lockfile==0.12.2 looker-sdk==23.0.0 -lxml==4.9.2 +lxml==4.9.3 lz4==4.3.2 makefun==1.15.1 Mako==1.2.4 -Markdown==3.4.3 -markdown-it-py==2.2.0 -MarkupSafe==2.1.2 -marshmallow==3.19.0 -marshmallow-enum==1.5.1 +Markdown==3.5 +markdown-it-py==3.0.0 +MarkupSafe==2.1.3 +marshmallow==3.20.1 marshmallow-oneofschema==3.0.1 marshmallow-sqlalchemy==0.26.1 matplotlib-inline==0.1.6 -mdit-py-plugins==0.3.5 +mdit-py-plugins==0.4.0 mdurl==0.1.2 -mistune==2.0.5 +mistune==3.0.2 mixpanel==4.10.0 -mmh3==4.0.0 -more-itertools==9.1.0 +mlflow-skinny==2.7.1 +mmh3==4.0.1 +mmhash3==3.0.1 +more-itertools==10.1.0 moreorless==0.4.0 -moto==4.1.10 -msal==1.16.0 -msal-extensions==1.0.0 +moto==4.2.5 +msal==1.22.0 multidict==6.0.4 -murmurhash==1.0.9 -mypy==1.3.0 +murmurhash==1.0.10 +mypy==1.6.0 mypy-extensions==1.0.0 nbclassic==1.0.0 nbclient==0.6.3 -nbconvert==7.4.0 -nbformat==5.8.0 -nest-asyncio==1.5.6 +nbconvert==7.9.2 +nbformat==5.9.1 +nest-asyncio==1.5.8 networkx==3.1 -notebook==6.5.4 +notebook==6.5.6 notebook_shim==0.2.3 -numpy==1.24.3 +numpy==1.26.0 oauthlib==3.2.2 okta==1.7.0 +openlineage-airflow==1.2.0 +openlineage-integration-common==1.2.0 +openlineage-python==1.2.0 +openlineage_sql==1.2.0 openpyxl==3.1.2 +opentelemetry-api==1.20.0 +opentelemetry-exporter-otlp==1.20.0 +opentelemetry-exporter-otlp-proto-common==1.20.0 +opentelemetry-exporter-otlp-proto-grpc==1.20.0 +opentelemetry-exporter-otlp-proto-http==1.20.0 +opentelemetry-proto==1.20.0 +opentelemetry-sdk==1.20.0 +opentelemetry-semantic-conventions==0.41b0 ordered-set==4.1.0 oscrypto==1.3.0 -packaging==23.1 +packaging==23.2 pandas==1.5.3 pandavro==1.5.2 pandocfilters==1.5.0 -parse==1.19.0 +parse==1.19.1 parso==0.8.3 -partd==1.4.0 -pathspec==0.9.0 -pathy==0.10.1 +partd==1.4.1 +pathspec==0.11.2 +pathy==0.10.2 pendulum==2.1.2 pexpect==4.8.0 phonenumbers==8.13.0 pickleshare==0.7.5 
-platformdirs==3.5.1 -pluggy==1.0.0 -portalocker==2.7.0 -preshed==3.0.8 +platformdirs==3.11.0 +pluggy==1.3.0 +preshed==3.0.9 prison==0.2.1 progressbar2==4.2.0 -prometheus-client==0.17.0 -prompt-toolkit==3.0.38 -proto-plus==1.22.2 -protobuf==4.23.2 +prometheus-client==0.17.1 +prompt-toolkit==3.0.39 +proto-plus==1.22.3 +protobuf==4.24.4 psutil==5.9.5 -psycopg2-binary==2.9.6 +psycopg2-binary==2.9.9 ptyprocess==0.7.0 pure-eval==0.2.2 pure-sasl==0.6.2 -py-partiql-parser==0.3.0 -pyarrow==8.0.0 +py-partiql-parser==0.3.7 +pyarrow==11.0.0 pyasn1==0.5.0 pyasn1-modules==0.3.0 pyathena==2.4.1 pycountry==22.3.5 pycparser==2.21 -pycryptodome==3.18.0 -pycryptodomex==3.18.0 -pydantic==1.10.8 -pydash==7.0.3 +pycryptodome==3.19.0 +pycryptodomex==3.19.0 +pydantic==1.10.13 +pydash==7.0.6 pydruid==0.6.5 -Pygments==2.15.1 -pymongo==4.3.3 -PyMySQL==1.0.3 -pyOpenSSL==22.0.0 +Pygments==2.16.1 +pyiceberg==0.4.0 +pymongo==4.5.0 +PyMySQL==1.1.0 +pyOpenSSL==23.2.0 pyparsing==3.0.9 -pyrsistent==0.19.3 -pyspnego==0.9.0 +pyspnego==0.10.2 python-daemon==3.0.1 python-dateutil==2.8.2 python-dotenv==1.0.0 @@ -272,111 +280,115 @@ python-jose==3.3.0 python-ldap==3.4.3 python-nvd3==0.15.0 python-slugify==8.0.1 -python-stdnum==1.18 -python-tds==1.12.0 -python-utils==3.6.0 +python-stdnum==1.19 +python-tds==1.13.0 +python-utils==3.8.1 python3-openid==3.2.0 -pytz==2023.3 +pytz==2023.3.post1 pytzdata==2020.1 -PyYAML==6.0 -pyzmq==25.1.0 +PyYAML==6.0.1 +pyzmq==24.0.1 ratelimiter==1.2.0.post0 redash-toolbelt==0.1.9 -redshift-connector==2.0.910 -regex==2023.5.5 -requests==2.28.2 +redshift-connector==2.0.914 +referencing==0.30.2 +regex==2023.10.3 +requests==2.31.0 requests-file==1.5.1 requests-gssapi==1.2.3 requests-ntlm==1.2.0 requests-toolbelt==0.10.1 -responses==0.23.1 -retrying==1.3.4 +responses==0.23.3 rfc3339-validator==0.1.4 rfc3986==2.0.0 -rich==13.3.5 -rich_argparse==1.1.0 +rich==13.6.0 +rich-argparse==1.3.0 +rpds-py==0.10.6 rsa==4.9 ruamel.yaml==0.17.17 -s3transfer==0.6.1 -sasl3==0.2.11 -schwifty==2023.3.0 -scipy==1.10.1 +ruamel.yaml.clib==0.2.8 +s3transfer==0.7.0 +schwifty==2023.9.0 +scipy==1.11.3 scramp==1.4.4 Send2Trash==1.8.2 -setproctitle==1.3.2 -simple-salesforce==1.12.4 +sentry-sdk==1.32.0 +setproctitle==1.3.3 +simple-salesforce==1.12.5 six==1.16.0 -smart-open==6.3.0 -smmap==5.0.0 +smart-open==6.4.0 +smmap==5.0.1 sniffio==1.3.0 -snowflake-connector-python==2.9.0 -snowflake-sqlalchemy==1.4.7 -soupsieve==2.4.1 +snowflake-connector-python==3.2.1 +snowflake-sqlalchemy==1.5.0 +sortedcontainers==2.4.0 +soupsieve==2.5 spacy==3.4.3 spacy-legacy==3.0.12 -spacy-loggers==1.0.4 +spacy-loggers==1.0.5 sql-metadata==2.2.2 -SQLAlchemy==1.4.41 -sqlalchemy-bigquery==1.6.1 +SQLAlchemy==1.4.44 +sqlalchemy-bigquery==1.8.0 SQLAlchemy-JSONField==1.0.1.post0 sqlalchemy-pytds==0.3.5 sqlalchemy-redshift==0.8.14 SQLAlchemy-Utils==0.41.1 -sqlalchemy2-stubs==0.0.2a34 -sqllineage==1.3.6 -sqlparse==0.4.3 -srsly==2.4.6 -stack-data==0.6.2 +sqlalchemy2-stubs==0.0.2a35 +sqllineage==1.3.8 +sqlparse==0.4.4 +srsly==2.4.8 +stack-data==0.6.3 starlette==0.27.0 +strictyaml==1.7.3 tableauserverclient==0.25 tableschema==1.20.2 tabulate==0.9.0 tabulator==1.53.5 -tenacity==8.2.2 +tenacity==8.2.3 termcolor==2.3.0 terminado==0.17.1 text-unidecode==1.3 -thinc==8.1.10 -thrift==0.16.0 +thinc==8.1.12 +thrift==0.13.0 thrift-sasl==0.4.3 tinycss2==1.2.1 toml==0.10.2 tomli==2.0.1 +tomlkit==0.12.1 toolz==0.12.0 -tornado==6.3.2 -tqdm==4.65.0 +tornado==6.3.3 +tqdm==4.66.1 traitlets==5.2.1.post0 -trino==0.324.0 +trino==0.327.0 typeguard==2.13.3 typer==0.7.0 
-types-PyYAML==6.0.12.10 +types-PyYAML==6.0.12.12 typing-inspect==0.9.0 -typing_extensions==4.5.0 -tzlocal==5.0.1 +typing_extensions==4.8.0 +tzlocal==5.1 uc-micro-py==1.0.2 -ujson==5.7.0 +ujson==5.8.0 unicodecsv==0.14.1 -urllib3==1.26.16 -uvicorn==0.22.0 +urllib3==1.26.17 +uvicorn==0.23.2 uvloop==0.17.0 -vertica-python==1.3.2 -vertica-sqlalchemy-dialect==0.0.1 +vertica-python==1.3.5 +vertica-sqlalchemy-dialect==0.0.8 vininfo==1.7.0 volatile==2.1.0 wasabi==0.10.1 -watchfiles==0.19.0 -wcmatch==8.4.1 -wcwidth==0.2.6 +watchfiles==0.20.0 +wcmatch==8.5 +wcwidth==0.2.8 webencodings==0.5.1 -websocket-client==1.5.2 +websocket-client==1.6.4 websockets==11.0.3 Werkzeug==2.2.3 -widgetsnbextension==4.0.7 +widgetsnbextension==4.0.9 wrapt==1.15.0 -WTForms==3.0.1 +WTForms==3.1.0 xlrd==2.0.1 xmltodict==0.13.0 yarl==1.9.2 zeep==4.2.1 -zipp==3.15.0 -zstd==1.5.5.1 +zstd==1.5.5.1 \ No newline at end of file diff --git a/docker/datahub-mae-consumer/Dockerfile b/docker/datahub-mae-consumer/Dockerfile index 734f8ba452f3ee..ec3da4de71d157 100644 --- a/docker/datahub-mae-consumer/Dockerfile +++ b/docker/datahub-mae-consumer/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder @@ -18,7 +18,7 @@ FROM alpine:3 AS base ENV JMX_VERSION=0.18.0 # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ - && apk --no-cache add curl bash coreutils sqlite \ + && apk --no-cache add curl bash coreutils sqlite libc6-compat java-snappy \ && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && apk --no-cache add jattach --repository http://dl-cdn.alpinelinux.org/alpine/edge/community/ \ && wget --no-verbose https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \ @@ -26,6 +26,8 @@ RUN apk --no-cache --update-cache --available upgrade \ && cp /usr/lib/jvm/java-11-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks COPY --from=binary /go/bin/dockerize /usr/local/bin +ENV LD_LIBRARY_PATH="/lib:/lib64" + FROM base as prod-install COPY mae-consumer-job.jar /datahub/datahub-mae-consumer/bin/ COPY metadata-models/src/main/resources/entity-registry.yml /datahub/datahub-mae-consumer/resources/entity-registry.yml diff --git a/docker/datahub-mce-consumer/Dockerfile b/docker/datahub-mce-consumer/Dockerfile index ee5d927fb1ddb1..f9c47f77a98f5b 100644 --- a/docker/datahub-mce-consumer/Dockerfile +++ b/docker/datahub-mce-consumer/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder @@ -18,7 +18,7 @@ FROM alpine:3 AS base ENV JMX_VERSION=0.18.0 # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ - && apk --no-cache add curl bash sqlite \ + && apk --no-cache add curl bash sqlite libc6-compat java-snappy \ && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && apk --no-cache add jattach --repository http://dl-cdn.alpinelinux.org/alpine/edge/community/ \ && wget --no-verbose 
https://github.com/open-telemetry/opentelemetry-java-instrumentation/releases/download/v1.24.0/opentelemetry-javaagent.jar \ @@ -33,6 +33,8 @@ COPY docker/datahub-mce-consumer/start.sh /datahub/datahub-mce-consumer/scripts/ COPY docker/monitoring/client-prometheus-config.yaml /datahub/datahub-mce-consumer/scripts/prometheus-config.yaml RUN chmod +x /datahub/datahub-mce-consumer/scripts/start.sh +ENV LD_LIBRARY_PATH="/lib:/lib64" + FROM base as dev-install # Dummy stage for development. Assumes code is built on your machine and mounted to this image. # See this excellent thread https://github.com/docker/cli/issues/1134 diff --git a/docker/datahub-upgrade/Dockerfile b/docker/datahub-upgrade/Dockerfile index 4e1521cc0561ee..f08e7268e4018b 100644 --- a/docker/datahub-upgrade/Dockerfile +++ b/docker/datahub-upgrade/Dockerfile @@ -1,7 +1,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder @@ -18,7 +18,7 @@ FROM alpine:3 AS base ENV JMX_VERSION=0.18.0 # PFP-260: Upgrade Sqlite to >=3.28.0-r0 to fix https://security.snyk.io/vuln/SNYK-ALPINE39-SQLITE-449762 RUN apk --no-cache --update-cache --available upgrade \ - && apk --no-cache add curl bash coreutils gcompat sqlite \ + && apk --no-cache add curl bash coreutils gcompat sqlite libc6-compat java-snappy \ && apk --no-cache add openjdk11-jre --repository=http://dl-cdn.alpinelinux.org/alpine/edge/community \ && curl -sS https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-runner/9.4.46.v20220331/jetty-runner-9.4.46.v20220331.jar --output jetty-runner.jar \ && curl -sS https://repo1.maven.org/maven2/org/eclipse/jetty/jetty-jmx/9.4.46.v20220331/jetty-jmx-9.4.46.v20220331.jar --output jetty-jmx.jar \ @@ -28,6 +28,8 @@ RUN apk --no-cache --update-cache --available upgrade \ && cp /usr/lib/jvm/java-11-openjdk/jre/lib/security/cacerts /tmp/kafka.client.truststore.jks COPY --from=binary /go/bin/dockerize /usr/local/bin +ENV LD_LIBRARY_PATH="/lib:/lib64" + FROM base as prod-install COPY datahub-upgrade.jar /datahub/datahub-upgrade/bin/ COPY metadata-models/src/main/resources/entity-registry.yml /datahub/datahub-gms/resources/entity-registry.yml diff --git a/docker/docker-compose-with-cassandra.yml b/docker/docker-compose-with-cassandra.yml index 9543e67da07f2a..39f43416005729 100644 --- a/docker/docker-compose-with-cassandra.yml +++ b/docker/docker-compose-with-cassandra.yml @@ -200,7 +200,10 @@ services: retries: 5 timeout: 5s volumes: - - zkdata:/var/lib/zookeeper + # See https://stackoverflow.com/a/61008432 for why we need two volumes. + # See also: https://docs.confluent.io/platform/current/installation/docker/operations/external-volumes.html#data-volumes-for-kafka-and-zk + - zkdata:/var/lib/zookeeper/data + - zklogs:/var/lib/zookeeper/log networks: default: name: datahub_network @@ -210,3 +213,4 @@ volumes: neo4jdata: broker: zkdata: + zklogs: diff --git a/docker/docker-compose-without-neo4j.yml b/docker/docker-compose-without-neo4j.yml index 022362782f7420..235e89e340551b 100644 --- a/docker/docker-compose-without-neo4j.yml +++ b/docker/docker-compose-without-neo4j.yml @@ -174,7 +174,10 @@ services: retries: 3 timeout: 5s volumes: - - zkdata:/var/lib/zookeeper + # See https://stackoverflow.com/a/61008432 for why we need two volumes. 
+ # See also: https://docs.confluent.io/platform/current/installation/docker/operations/external-volumes.html#data-volumes-for-kafka-and-zk + - zkdata:/var/lib/zookeeper/data + - zklogs:/var/lib/zookeeper/log networks: default: name: datahub_network @@ -182,3 +185,4 @@ volumes: esdata: broker: zkdata: + zklogs: diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index a486689e050a21..46da8c6fdbd2ae 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -195,7 +195,10 @@ services: retries: 3 timeout: 5s volumes: - - zkdata:/var/lib/zookeeper + # See https://stackoverflow.com/a/61008432 for why we need two volumes. + # See also: https://docs.confluent.io/platform/current/installation/docker/operations/external-volumes.html#data-volumes-for-kafka-and-zk + - zkdata:/var/lib/zookeeper/data + - zklogs:/var/lib/zookeeper/log networks: default: name: datahub_network @@ -204,3 +207,4 @@ volumes: neo4jdata: broker: zkdata: + zklogs: diff --git a/docker/elasticsearch-setup/Dockerfile b/docker/elasticsearch-setup/Dockerfile index af3c8c9df762ab..c8fb2eba911b8a 100644 --- a/docker/elasticsearch-setup/Dockerfile +++ b/docker/elasticsearch-setup/Dockerfile @@ -3,7 +3,7 @@ # Defining environment ARG APP_ENV=prod -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/kafka-setup/kafka-config.sh b/docker/kafka-setup/kafka-config.sh index 2ba8e2d7c5d47c..4d5698ccc3856f 100644 --- a/docker/kafka-setup/kafka-config.sh +++ b/docker/kafka-setup/kafka-config.sh @@ -2,6 +2,7 @@ : ${PARTITIONS:=1} : ${REPLICATION_FACTOR:=1} +: ${MAX_MESSAGE_BYTES:=5242880} : ${KAFKA_PROPERTIES_SECURITY_PROTOCOL:=PLAINTEXT} @@ -12,3 +13,4 @@ export KAFKA_HEAP_OPTS="-Xmx64M" CONNECTION_PROPERTIES_PATH=/tmp/connection.properties WORKERS=4 +DELIMITER=";" diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh index 629e9bc9484ee1..439ffb4d4d8295 100755 --- a/docker/kafka-setup/kafka-setup.sh +++ b/docker/kafka-setup/kafka-setup.sh @@ -36,7 +36,9 @@ if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SSL" ]]; then fi if [[ -n $KAFKA_PROPERTIES_SSL_TRUSTSTORE_LOCATION ]]; then echo "ssl.truststore.location=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_LOCATION" >> $CONNECTION_PROPERTIES_PATH - echo "ssl.truststore.password=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_PASSWORD" >> $CONNECTION_PROPERTIES_PATH + if [[ $KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE != "PEM" ]]; then + echo "ssl.truststore.password=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_PASSWORD" >> $CONNECTION_PROPERTIES_PATH + fi if [[ -n $KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE ]]; then echo "ssl.truststore.type=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE" >> $CONNECTION_PROPERTIES_PATH fi @@ -100,24 +102,43 @@ exec 4<&- send() { work_id=$1 topic_args=$2 - echo sending $work_id $topic_args - echo "$work_id" "$topic_args" 1>&3 ## the fifo is fd 3 + topic_config=$3 + + echo -e "sending $work_id\n worker_args: ${topic_args}${DELIMITER}${topic_config}" + echo "$work_id" "${topic_args}${DELIMITER}${topic_config}" 1>&3 ## the fifo is fd 3 } ## Produce the jobs to run. 
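# Each job sent below now carries two payloads joined by $DELIMITER (";" per
# kafka-config.sh): first the kafka-topics.sh creation flags, then a full
# kafka-configs.sh argument string that raises max.message.bytes to
# $MAX_MESSAGE_BYTES (default 5242880, i.e. 5 MiB) once the topic exists.
# A worker splits the payload back apart with cut, e.g. (sketch only):
#   topic_args=$(echo "$worker_args" | cut -d "$DELIMITER" -f 1)
#   topic_config=$(echo "$worker_args" | cut -d "$DELIMITER" -f 2)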
-send "$METADATA_AUDIT_EVENT_NAME" "--partitions $PARTITIONS --topic $METADATA_AUDIT_EVENT_NAME" -send "$METADATA_CHANGE_EVENT_NAME" "--partitions $PARTITIONS --topic $METADATA_CHANGE_EVENT_NAME" -send "$FAILED_METADATA_CHANGE_EVENT_NAME" "--partitions $PARTITIONS --topic $FAILED_METADATA_CHANGE_EVENT_NAME" -send "$METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME" "--partitions $PARTITIONS --topic $METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME" +send "$METADATA_AUDIT_EVENT_NAME" "--partitions $PARTITIONS --topic $METADATA_AUDIT_EVENT_NAME" \ + "--entity-type topics --entity-name $METADATA_AUDIT_EVENT_NAME --alter --add-config max.message.bytes=$MAX_MESSAGE_BYTES" + +send "$METADATA_CHANGE_EVENT_NAME" "--partitions $PARTITIONS --topic $METADATA_CHANGE_EVENT_NAME" \ + "--entity-type topics --entity-name $METADATA_CHANGE_EVENT_NAME --alter --add-config max.message.bytes=$MAX_MESSAGE_BYTES" +send "$FAILED_METADATA_CHANGE_EVENT_NAME" "--partitions $PARTITIONS --topic $FAILED_METADATA_CHANGE_EVENT_NAME" \ + "--entity-type topics --entity-name $FAILED_METADATA_CHANGE_EVENT_NAME --alter --add-config max.message.bytes=$MAX_MESSAGE_BYTES" + +send "$METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME" "--partitions $PARTITIONS --topic $METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME" \ + "--entity-type topics --entity-name $METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME --alter --add-config max.message.bytes=$MAX_MESSAGE_BYTES" # Set retention to 90 days -send "$METADATA_CHANGE_LOG_TIMESERIES_TOPIC_NAME" "--partitions $PARTITIONS --config retention.ms=7776000000 --topic $METADATA_CHANGE_LOG_TIMESERIES_TOPIC_NAME" -send "$METADATA_CHANGE_PROPOSAL_TOPIC_NAME" "--partitions $PARTITIONS --topic $METADATA_CHANGE_PROPOSAL_TOPIC_NAME" -send "$FAILED_METADATA_CHANGE_PROPOSAL_TOPIC_NAME" "--partitions $PARTITIONS --topic $FAILED_METADATA_CHANGE_PROPOSAL_TOPIC_NAME" -send "$PLATFORM_EVENT_TOPIC_NAME" "--partitions $PARTITIONS --topic $PLATFORM_EVENT_TOPIC_NAME" +send "$METADATA_CHANGE_LOG_TIMESERIES_TOPIC_NAME" "--partitions $PARTITIONS --config retention.ms=7776000000 --topic $METADATA_CHANGE_LOG_TIMESERIES_TOPIC_NAME" \ + "--entity-type topics --entity-name $METADATA_CHANGE_LOG_TIMESERIES_TOPIC_NAME --alter --add-config max.message.bytes=$MAX_MESSAGE_BYTES" + +send "$METADATA_CHANGE_PROPOSAL_TOPIC_NAME" "--partitions $PARTITIONS --topic $METADATA_CHANGE_PROPOSAL_TOPIC_NAME" \ + "--entity-type topics --entity-name $METADATA_CHANGE_PROPOSAL_TOPIC_NAME --alter --add-config max.message.bytes=$MAX_MESSAGE_BYTES" +send "$FAILED_METADATA_CHANGE_PROPOSAL_TOPIC_NAME" "--partitions $PARTITIONS --topic $FAILED_METADATA_CHANGE_PROPOSAL_TOPIC_NAME" \ + "--entity-type topics --entity-name $FAILED_METADATA_CHANGE_PROPOSAL_TOPIC_NAME --alter --add-config max.message.bytes=$MAX_MESSAGE_BYTES" + +send "$PLATFORM_EVENT_TOPIC_NAME" "--partitions $PARTITIONS --topic $PLATFORM_EVENT_TOPIC_NAME" \ + "--entity-type topics --entity-name $PLATFORM_EVENT_TOPIC_NAME --alter --add-config max.message.bytes=$MAX_MESSAGE_BYTES" # Infinite retention upgrade topic -send "$DATAHUB_UPGRADE_HISTORY_TOPIC_NAME" "--partitions 1 --config retention.ms=-1 --topic $DATAHUB_UPGRADE_HISTORY_TOPIC_NAME" + # Make sure the retention.ms config for $DATAHUB_UPGRADE_HISTORY_TOPIC_NAME is configured to infinite + # Please see the bug report below for details + # https://github.com/datahub-project/datahub/issues/7882 +send "$DATAHUB_UPGRADE_HISTORY_TOPIC_NAME" "--partitions 1 --config retention.ms=-1 --topic $DATAHUB_UPGRADE_HISTORY_TOPIC_NAME" \ + "--entity-type topics --entity-name 
"$DATAHUB_UPGRADE_HISTORY_TOPIC_NAME" --alter --add-config retention.ms=-1" + # Create topic for datahub usage event if [[ $DATAHUB_ANALYTICS_ENABLED == true ]]; then send "$DATAHUB_USAGE_EVENT_NAME" "--partitions $PARTITIONS --topic $DATAHUB_USAGE_EVENT_NAME" @@ -148,8 +169,3 @@ if [[ $USE_CONFLUENT_SCHEMA_REGISTRY == "TRUE" ]]; then --entity-name _schemas \ --alter --add-config cleanup.policy=compact fi - -# Make sure the retention.ms config for $DATAHUB_UPGRADE_HISTORY_TOPIC_NAME is configured to infinite -# Please see the bug report below for details -# https://github.com/datahub-project/datahub/issues/7882 -kafka-configs.sh --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER --entity-type topics --entity-name "$DATAHUB_UPGRADE_HISTORY_TOPIC_NAME" --alter --add-config retention.ms=-1 diff --git a/docker/kafka-setup/kafka-topic-workers.sh b/docker/kafka-setup/kafka-topic-workers.sh index fd0d45c3f46119..3ddf41abbabf5d 100644 --- a/docker/kafka-setup/kafka-topic-workers.sh +++ b/docker/kafka-setup/kafka-topic-workers.sh @@ -11,10 +11,18 @@ START_LOCK=$4 ## the queue workers are supposed to be doing job() { i=$1 - topic_args=$2 + worker_args=$2 + topic_args=$(echo $worker_args | cut -d "$DELIMITER" -f 1) + topic_config=$(echo $worker_args | cut -d "$DELIMITER" -f 2) + + echo " $i: kafka-topics.sh --create --if-not-exist $topic_args" kafka-topics.sh --create --if-not-exists --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER \ --replication-factor $REPLICATION_FACTOR \ $topic_args + if [[ ! -z "$topic_config" ]]; then + echo " $i: kafka-configs.sh $topic_config" + kafka-configs.sh --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER $topic_config + fi } ## This is the worker to read from the queue. 
diff --git a/docker/mariadb/init.sql b/docker/mariadb/init.sql index c4132575cf442c..95c8cabbc5ca4a 100644 --- a/docker/mariadb/init.sql +++ b/docker/mariadb/init.sql @@ -28,3 +28,5 @@ insert into metadata_aspect_v2 (urn, aspect, version, metadata, createdon, creat now(), 'urn:li:corpuser:__datahub_system' ); + +DROP TABLE IF EXISTS metadata_index; diff --git a/docker/mysql-setup/Dockerfile b/docker/mysql-setup/Dockerfile index 732b860a58f07f..56bab611804892 100644 --- a/docker/mysql-setup/Dockerfile +++ b/docker/mysql-setup/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/mysql-setup/init.sql b/docker/mysql-setup/init.sql index 2370a971941d21..b789329ddfd179 100644 --- a/docker/mysql-setup/init.sql +++ b/docker/mysql-setup/init.sql @@ -39,3 +39,5 @@ INSERT INTO metadata_aspect_v2 SELECT * FROM temp_metadata_aspect_v2 WHERE NOT EXISTS (SELECT * from metadata_aspect_v2); DROP TABLE temp_metadata_aspect_v2; + +DROP TABLE IF EXISTS metadata_index; diff --git a/docker/mysql/init.sql b/docker/mysql/init.sql index b4b4e4617806c0..aca57d7cd444c0 100644 --- a/docker/mysql/init.sql +++ b/docker/mysql/init.sql @@ -27,3 +27,5 @@ INSERT INTO metadata_aspect_v2 (urn, aspect, version, metadata, createdon, creat now(), 'urn:li:corpuser:__datahub_system' ); + +DROP TABLE IF EXISTS metadata_index; diff --git a/docker/postgres-setup/Dockerfile b/docker/postgres-setup/Dockerfile index 313615ac3465bb..7f4d53ae044d46 100644 --- a/docker/postgres-setup/Dockerfile +++ b/docker/postgres-setup/Dockerfile @@ -1,4 +1,4 @@ -FROM golang:1-alpine3.17 AS binary +FROM golang:1-alpine3.18 AS binary ENV DOCKERIZE_VERSION v0.6.1 WORKDIR /go/src/github.com/jwilder diff --git a/docker/postgres-setup/init.sql b/docker/postgres-setup/init.sql index 12fff7aec7fe6f..72b2f73192e00f 100644 --- a/docker/postgres-setup/init.sql +++ b/docker/postgres-setup/init.sql @@ -35,3 +35,5 @@ INSERT INTO metadata_aspect_v2 SELECT * FROM temp_metadata_aspect_v2 WHERE NOT EXISTS (SELECT * from metadata_aspect_v2); DROP TABLE temp_metadata_aspect_v2; + +DROP TABLE IF EXISTS metadata_index; diff --git a/docker/postgres/init.sql b/docker/postgres/init.sql index cf477c135422e8..87c8dd3337fac9 100644 --- a/docker/postgres/init.sql +++ b/docker/postgres/init.sql @@ -28,3 +28,5 @@ insert into metadata_aspect_v2 (urn, aspect, version, metadata, createdon, creat now(), 'urn:li:corpuser:__datahub_system' ); + +DROP TABLE IF EXISTS metadata_index; diff --git a/docker/quickstart/docker-compose-m1.quickstart.yml b/docker/quickstart/docker-compose-m1.quickstart.yml index 89e9aaa0defd61..3b6d02c83d0f07 100644 --- a/docker/quickstart/docker-compose-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-m1.quickstart.yml @@ -16,6 +16,8 @@ services: - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0 - KAFKA_HEAP_OPTS=-Xms256m -Xmx256m - KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE=false + - KAFKA_MESSAGE_MAX_BYTES=5242880 + - KAFKA_MAX_MESSAGE_BYTES=5242880 healthcheck: interval: 1s retries: 5 @@ -298,7 +300,8 @@ services: ports: - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181 volumes: - - zkdata:/var/lib/zookeeper + - zkdata:/var/lib/zookeeper/data + - zklogs:/var/lib/zookeeper/log version: '3.9' volumes: broker: null @@ -306,3 +309,4 @@ volumes: mysqldata: null neo4jdata: null zkdata: null + zklogs: null diff --git a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml 
index f6284edc83648e..e45bafc3da480e 100644 --- a/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j-m1.quickstart.yml @@ -16,6 +16,8 @@ services: - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0 - KAFKA_HEAP_OPTS=-Xms256m -Xmx256m - KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE=false + - KAFKA_MESSAGE_MAX_BYTES=5242880 + - KAFKA_MAX_MESSAGE_BYTES=5242880 healthcheck: interval: 1s retries: 5 @@ -272,10 +274,12 @@ services: ports: - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181 volumes: - - zkdata:/var/lib/zookeeper + - zkdata:/var/lib/zookeeper/data + - zklogs:/var/lib/zookeeper/log version: '3.9' volumes: broker: null esdata: null mysqldata: null zkdata: null + zklogs: null diff --git a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml index 4e3503e35c0db1..020ef5e9a97b96 100644 --- a/docker/quickstart/docker-compose-without-neo4j.quickstart.yml +++ b/docker/quickstart/docker-compose-without-neo4j.quickstart.yml @@ -16,6 +16,8 @@ services: - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0 - KAFKA_HEAP_OPTS=-Xms256m -Xmx256m - KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE=false + - KAFKA_MESSAGE_MAX_BYTES=5242880 + - KAFKA_MAX_MESSAGE_BYTES=5242880 healthcheck: interval: 1s retries: 5 @@ -272,10 +274,12 @@ services: ports: - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181 volumes: - - zkdata:/var/lib/zookeeper + - zkdata:/var/lib/zookeeper/data + - zklogs:/var/lib/zookeeper/log version: '3.9' volumes: broker: null esdata: null mysqldata: null zkdata: null + zklogs: null diff --git a/docker/quickstart/docker-compose.quickstart.yml b/docker/quickstart/docker-compose.quickstart.yml index e2f52064389e06..8adc2b9063b840 100644 --- a/docker/quickstart/docker-compose.quickstart.yml +++ b/docker/quickstart/docker-compose.quickstart.yml @@ -16,6 +16,8 @@ services: - KAFKA_GROUP_INITIAL_REBALANCE_DELAY_MS=0 - KAFKA_HEAP_OPTS=-Xms256m -Xmx256m - KAFKA_CONFLUENT_SUPPORT_METRICS_ENABLE=false + - KAFKA_MESSAGE_MAX_BYTES=5242880 + - KAFKA_MAX_MESSAGE_BYTES=5242880 healthcheck: interval: 1s retries: 5 @@ -298,7 +300,8 @@ services: ports: - ${DATAHUB_MAPPED_ZK_PORT:-2181}:2181 volumes: - - zkdata:/var/lib/zookeeper + - zkdata:/var/lib/zookeeper/data + - zklogs:/var/lib/zookeeper/log version: '3.9' volumes: broker: null @@ -306,3 +309,4 @@ volumes: mysqldata: null neo4jdata: null zkdata: null + zklogs: null diff --git a/docs-website/docusaurus.config.js b/docs-website/docusaurus.config.js index 68ea1ebffa6c99..506e263933394a 100644 --- a/docs-website/docusaurus.config.js +++ b/docs-website/docusaurus.config.js @@ -13,6 +13,13 @@ module.exports = { projectName: "datahub", // Usually your repo name. 
staticDirectories: ["static", "genStatic"], stylesheets: ["https://fonts.googleapis.com/css2?family=Manrope:wght@400;500;700&display=swap"], + scripts: [ + { + src: "https://tools.luckyorange.com/core/lo.js?site-id=28ea8a38", + async: true, + defer: true, + }, + ], noIndex: isSaas, customFields: { isSaas: isSaas, @@ -50,44 +57,41 @@ module.exports = { position: "right", }, { - to: "https://demo.datahubproject.io/", - label: "Demo", - position: "right", - }, - { - href: "https://blog.datahubproject.io/", - label: "Blog", - position: "right", - }, - { - href: "https://feature-requests.datahubproject.io/roadmap", - label: "Roadmap", + type: "dropdown", + label: "Resources", position: "right", + items: [ + { + href: "https://demo.datahubproject.io/", + label: "Demo", + }, + { + href: "https://blog.datahubproject.io/", + label: "Blog", + }, + { + href: "https://feature-requests.datahubproject.io/roadmap", + label: "Roadmap", + }, + { + href: "https://slack.datahubproject.io", + label: "Slack", + }, + { + href: "https://github.com/datahub-project/datahub", + label: "GitHub", + }, + { + href: "https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w", + label: "YouTube", + }, + ], }, { type: "docsVersionDropdown", - position: "right", + position: "left", dropdownActiveClassDisabled: true, }, - { - href: "https://slack.datahubproject.io", - "aria-label": "Slack", - position: "right", - className: "item__icon item__slack", - }, - { - href: "https://github.com/datahub-project/datahub", - "aria-label": "GitHub", - position: "right", - className: "item__icon item__github", - }, - - { - href: "https://www.youtube.com/channel/UC3qFQC5IiwR5fvWEqi_tJ5w", - "aria-label": "YouTube", - position: "right", - className: "item__icon item__youtube", - }, ], }, footer: { diff --git a/docs-website/download_historical_versions.py b/docs-website/download_historical_versions.py index 83157edc1972cf..53ee9cf1e63ef5 100644 --- a/docs-website/download_historical_versions.py +++ b/docs-website/download_historical_versions.py @@ -1,6 +1,7 @@ import json import os import tarfile +import time import urllib.request repo_url = "https://api.github.com/repos/datahub-project/static-assets" @@ -16,17 +17,30 @@ def download_file(url, destination): f.write(chunk) -def fetch_urls(repo_url: str, folder_path: str, file_format: str): +def fetch_urls( + repo_url: str, folder_path: str, file_format: str, max_retries=3, retry_delay=5 +): api_url = f"{repo_url}/contents/{folder_path}" - response = urllib.request.urlopen(api_url) - data = response.read().decode("utf-8") - urls = [ - file["download_url"] - for file in json.loads(data) - if file["name"].endswith(file_format) - ] - print(urls) - return urls + for attempt in range(max_retries + 1): + try: + response = urllib.request.urlopen(api_url) + if response.status == 403 or (500 <= response.status < 600): + raise Exception(f"HTTP Error {response.status}: {response.reason}") + data = response.read().decode("utf-8") + urls = [ + file["download_url"] + for file in json.loads(data) + if file["name"].endswith(file_format) + ] + print(urls) + return urls + except Exception as e: + if attempt < max_retries: + print(f"Attempt {attempt + 1}/{max_retries}: {e}") + time.sleep(retry_delay) + else: + print(f"Max retries reached. 
Unable to fetch data.") + raise def extract_tar_file(destination_path): diff --git a/docs-website/sidebars.js b/docs-website/sidebars.js index bdf3926c17e0d6..39eaea57444ed1 100644 --- a/docs-website/sidebars.js +++ b/docs-website/sidebars.js @@ -140,6 +140,7 @@ module.exports = { "metadata-ingestion/docs/dev_guides/classification", "metadata-ingestion/docs/dev_guides/add_stateful_ingestion_to_source", "metadata-ingestion/docs/dev_guides/sql_profiles", + "metadata-ingestion/docs/dev_guides/profiling_ingestions", ], }, ], @@ -157,6 +158,7 @@ module.exports = { // The purpose of this section is to provide the minimum steps required to deploy DataHub to the vendor of your choosing "docs/deploy/aws", "docs/deploy/gcp", + "docs/deploy/azure", "docker/README", "docs/deploy/kubernetes", "docs/deploy/environment-vars", @@ -440,10 +442,29 @@ module.exports = { }, "docs/act-on-metadata/impact-analysis", { - Observability: [ - "docs/managed-datahub/observe/freshness-assertions", - "docs/managed-datahub/observe/volume-assertions", - "docs/managed-datahub/observe/custom-sql-assertions", + label: "Observability", + type: "category", + items: [ + { + type: "doc", + id: "docs/managed-datahub/observe/freshness-assertions", + className: "saasOnly", + }, + { + type: "doc", + id: "docs/managed-datahub/observe/volume-assertions", + className: "saasOnly", + }, + { + type: "doc", + id: "docs/managed-datahub/observe/custom-sql-assertions", + className: "saasOnly", + }, + { + type: "doc", + id: "docs/managed-datahub/observe/column-assertions", + className: "saasOnly", + }, ], }, { @@ -603,10 +624,10 @@ module.exports = { { type: "doc", id: "docs/managed-datahub/chrome-extension", - className: "saasOnly", }, { "Managed DataHub Release History": [ + "docs/managed-datahub/release-notes/v_0_2_12", "docs/managed-datahub/release-notes/v_0_2_11", "docs/managed-datahub/release-notes/v_0_2_10", "docs/managed-datahub/release-notes/v_0_2_9", diff --git a/docs-website/src/styles/global.scss b/docs-website/src/styles/global.scss index 55a54876b41acd..16e3893ed08b7d 100644 --- a/docs-website/src/styles/global.scss +++ b/docs-website/src/styles/global.scss @@ -144,20 +144,29 @@ div[class^="announcementBar"] { /** Navbar */ -@media only screen and (max-width: 1050px) { - .navbar__toggle { - display: inherit; - } - .navbar__item { - display: none; - } -} - .navbar { .navbar__logo { height: 3rem; } + + .navbar__link { + align-items: center; + margin: 0 1rem 0; + padding: 0; + border-bottom: 2px solid transparent; + } + + .dropdown > .navbar__link:after { + top: -1px; + border-width: 0.3em 0.3em 0; + margin-left: 0.4em; + } + + .navbar__link--active { + border-bottom-color: var(--ifm-navbar-link-hover-color); + } .navbar__item { + padding: 0.25rem 0; svg[class*="iconExternalLink"] { display: none; } diff --git a/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js b/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js index cc04ab23d3cf37..661d64392e67fe 100644 --- a/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js +++ b/docs-website/src/theme/NavbarItem/DocsVersionDropdownNavbarItem.js @@ -6,6 +6,9 @@ import { translate } from "@docusaurus/Translate"; import { useLocation } from "@docusaurus/router"; import DefaultNavbarItem from "@theme/NavbarItem/DefaultNavbarItem"; import DropdownNavbarItem from "@theme/NavbarItem/DropdownNavbarItem"; + +import styles from "./styles.module.scss"; + const getVersionMainDoc = (version) => version.docs.find((doc) => doc.id === version.mainDocId); export 
default function DocsVersionDropdownNavbarItem({ mobile, @@ -60,6 +63,7 @@ export default function DocsVersionDropdownNavbarItem({ return (

- ## Add Column-level Lineage @@ -135,12 +133,10 @@ You can now see the lineage between `fct_users_deleted` and `logging_events`. You can now see the column-level lineage between datasets. Note that you have to enable `Show Columns` to be able to see the column-level lineage. -

- ## Read Lineage @@ -180,7 +176,7 @@ query searchAcrossLineage { } ``` -This example shows using lineage degrees as a filter, but additional search filters can be included here as well. +This example shows using lineage degrees as a filter, but additional search filters can be included here as well. @@ -188,7 +184,7 @@ This example shows using lineage degrees as a filter, but additional search filt ```shell curl --location --request POST 'http://localhost:8080/api/graphql' \ --header 'Authorization: Bearer ' \ ---header 'Content-Type: application/json' --data-raw '{ { "query": "mutation searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}" +--header 'Content-Type: application/json' --data-raw '{ { "query": "query searchAcrossLineage { searchAcrossLineage( input: { query: \"*\" urn: \"urn:li:dataset:(urn:li:dataPlatform:dbt,long_tail_companions.adoption.human_profiles,PROD)\" start: 0 count: 10 direction: DOWNSTREAM orFilters: [ { and: [ { condition: EQUAL negated: false field: \"degree\" values: [\"1\", \"2\", \"3+\"] } ] } ] } ) { searchResults { degree entity { urn type } } }}" }}' ``` diff --git a/docs/authentication/README.md b/docs/authentication/README.md index f6eda887844863..ff4a3d83cfde3b 100644 --- a/docs/authentication/README.md +++ b/docs/authentication/README.md @@ -31,8 +31,9 @@ When a user makes a request for Data within DataHub, the request is authenticate and programmatic calls to DataHub APIs. There are two types of tokens that are important: 1. **Session Tokens**: Generated for users of the DataHub web application. By default, they have a duration of 24 hours. -These tokens are encoded and stored inside browser-side session cookies. The duration a session token is valid for is configurable via the `AUTH_SESSION_TTL_HOURS` environment variable -on the datahub-frontend deployment. +These tokens are encoded and stored inside browser-side session cookies. The duration a session token is valid for is configurable via the `MAX_SESSION_TOKEN_AGE` environment variable +on the datahub-frontend deployment. Additionally, the `AUTH_SESSION_TTL_HOURS` variable configures the expiration time of the actor cookie in the user's browser, which will also prompt a user login. The difference between these is that when the actor cookie expires, only the browser session is affected and the session token can still be used programmatically, +but once the session token itself expires it can no longer be used programmatically either, as it is created as a JWT with an expiration claim. 2. **Personal Access Tokens**: These are tokens generated via the DataHub settings panel, useful for interacting with DataHub APIs. They can be used to automate processes like enriching documentation, ownership, tags, and more on DataHub. Learn more about Personal Access Tokens [here](personal-access-tokens.md).
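To make the interaction between the two settings concrete, here is a hedged sketch of setting them together on a datahub-frontend container. The variable names come from the docs above, while the values, image, and docker-style invocation are illustrative assumptions:

```shell
# Illustrative only: extend both the session token and the actor cookie to 48 hours,
# keeping them aligned so the cookie does not outlive the JWT inside it (or vice versa).
# MAX_SESSION_TOKEN_AGE takes a relative Java date string; AUTH_SESSION_TTL_HOURS is numeric.
docker run -d \
  -e MAX_SESSION_TOKEN_AGE=48h \
  -e AUTH_SESSION_TTL_HOURS=48 \
  acryldata/datahub-frontend-react:head
```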
diff --git a/docs/authentication/guides/sso/configure-oidc-react.md b/docs/authentication/guides/sso/configure-oidc-react.md index 512d6adbf916fc..1671673c09318c 100644 --- a/docs/authentication/guides/sso/configure-oidc-react.md +++ b/docs/authentication/guides/sso/configure-oidc-react.md @@ -72,7 +72,8 @@ AUTH_OIDC_BASE_URL=your-datahub-url - `AUTH_OIDC_CLIENT_SECRET`: Unique client secret received from identity provider - `AUTH_OIDC_DISCOVERY_URI`: Location of the identity provider OIDC discovery API. Suffixed with `.well-known/openid-configuration` - `AUTH_OIDC_BASE_URL`: The base URL of your DataHub deployment, e.g. https://yourorgdatahub.com (prod) or http://localhost:9002 (testing) -- `AUTH_SESSION_TTL_HOURS`: The length of time in hours before a user will be prompted to login again. Session tokens are stateless so this determines at what time a session token may no longer be used and a valid session token can be used until this time has passed. +- `AUTH_SESSION_TTL_HOURS`: The length of time in hours before a user will be prompted to login again. Controls the actor cookie expiration time in the browser. Numeric value converted to hours, default 24. +- `MAX_SESSION_TOKEN_AGE`: Determines the expiration time of a session token. Session tokens are stateless, so this determines the time after which a session token may no longer be used; a valid session token remains usable until that time has passed. Accepts a valid relative Java date style String, default 24h. Providing these configs will cause DataHub to delegate authentication to your identity provider, requesting the "oidc email profile" scopes and parsing the "preferred_username" claim from diff --git a/docs/authorization/policies.md b/docs/authorization/policies.md index e3606f2a3e48d3..63aa6688d3eecf 100644 --- a/docs/authorization/policies.md +++ b/docs/authorization/policies.md @@ -137,7 +137,7 @@ We currently support the following: #### Resources The resource filter defines the set of resources that the policy applies to, using a list of criteria. Each -criterion defines a field type (like resource_type, resource_urn, domain), a list of field values to compare, and a +criterion defines a field type (like type, urn, domain), a list of field values to compare, and a condition (like EQUALS). It essentially checks whether the field of a certain resource matches any of the input values. Note that if there are no criteria or the resource is not set, the policy is applied to ALL resources. @@ -149,7 +149,7 @@ For example, the following resource filter will apply the policy to datasets, ch "filter": { "criteria": [ { - "field": "RESOURCE_TYPE", + "field": "TYPE", "condition": "EQUALS", "values": [ "dataset", @@ -175,8 +175,8 @@ Supported fields are as follows | Field Type | Description | Example | |---------------|------------------------|-------------------------| -| resource_type | Type of the resource | dataset, chart, dataJob | -| resource_urn | Urn of the resource | urn:li:dataset:... | +| type | Type of the resource | dataset, chart, dataJob | +| urn | Urn of the resource | urn:li:dataset:... 
| | domain | Domain of the resource | urn:li:domain:domainX | ## Managing Policies diff --git a/docs/datahub_lite.md b/docs/datahub_lite.md index de0a20eed1d01f..55491e3b998cf2 100644 --- a/docs/datahub_lite.md +++ b/docs/datahub_lite.md @@ -85,9 +85,10 @@ source: sink: type: datahub-lite - forward_to: - type: datahub-rest - config: + config: + forward_to: + type: datahub-rest + config: server: "http://datahub-gms:8080" ``` diff --git a/docs/deploy/aws.md b/docs/deploy/aws.md index 228fcb51d1a28f..6598b93c25e9a1 100644 --- a/docs/deploy/aws.md +++ b/docs/deploy/aws.md @@ -15,7 +15,7 @@ This guide requires the following tools: - [kubectl](https://kubernetes.io/docs/tasks/tools/) to manage kubernetes resources - [helm](https://helm.sh/docs/intro/install/) to deploy the resources based on helm charts. Note, we only support Helm 3. -- [eksctl](https://eksctl.io/introduction/#installation) to create and manage clusters on EKS +- [eksctl](https://eksctl.io/installation/) to create and manage clusters on EKS - [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) to manage AWS resources To use the above tools, you need to set up AWS credentials by following @@ -100,7 +100,7 @@ eksctl create iamserviceaccount \ Install the TargetGroupBinding custom resource definition by running the following. ``` -kubectl apply -k "github.com/aws/eks-charts/stable/aws-load-balancer-controller//crds?ref=master" +kubectl apply -k "github.com/aws/eks-charts/stable/aws-load-balancer-controller/crds?ref=master" ``` Add the helm chart repository containing the latest version of the ALB controller. diff --git a/docs/deploy/azure.md b/docs/deploy/azure.md new file mode 100644 index 00000000000000..b940b82827e947 --- /dev/null +++ b/docs/deploy/azure.md @@ -0,0 +1,234 @@ +--- +title: "Deploying to Azure" +--- + +# Azure setup guide + +The following is a set of instructions to quickstart DataHub on Azure Kubernetes Service (AKS). Note, the guide +assumes that you do not have a Kubernetes cluster set up. + +## Prerequisites + +This guide requires the following tools: + +- [kubectl](https://kubernetes.io/docs/tasks/tools/) to manage Kubernetes resources +- [helm](https://helm.sh/docs/intro/install/) to deploy the resources based on helm charts. Note, we only support Helm + 3. +- [AZ CLI](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli) to manage Azure resources + +To use the above tools, you need to set up Azure credentials by following +this [guide](https://learn.microsoft.com/en-us/cli/azure/authenticate-azure-cli). + +## Start up a Kubernetes cluster on AKS + +You can follow this [guide](https://learn.microsoft.com/en-us/azure/aks/learn/quick-kubernetes-deploy-cli) to create a new +cluster using az cli. + +Note: you can skip the application deployment step since we are deploying DataHub instead. If you are deploying DataHub to an existing cluster, please +skip the corresponding sections. + +- Verify you have the Microsoft.OperationsManagement and Microsoft.OperationalInsights providers registered on your subscription. These Azure resource providers are required to support Container insights. 
Check the registration status using the following commands: + +``` +az provider show -n Microsoft.OperationsManagement -o table +az provider show -n Microsoft.OperationalInsights -o table +``` + +If they're not registered, register them using the following commands: + +``` +az provider register --namespace Microsoft.OperationsManagement +az provider register --namespace Microsoft.OperationalInsights +``` + +- Create a resource group. Change the name and location to your choosing. + +``` +az group create --name myResourceGroup --location eastus +``` + +The following output indicates that the command execution was successful: + +``` +{ + "id": "/subscriptions//resourceGroups/myResourceGroup", + "location": "eastus", + "managedBy": null, + "name": "myResourceGroup", + "properties": { + "provisioningState": "Succeeded" + }, + "tags": null +} +``` +- Create an AKS Cluster. For this project, it is best to increase the node count to at least 3. Change the cluster name, node count, and add-ons to your choosing. + +``` +az aks create -g myResourceGroup -n myAKSCluster --enable-managed-identity --node-count 3 --enable-addons monitoring --generate-ssh-keys +``` + +After a few minutes, the command completes and returns JSON-formatted information about the cluster. + +- Connect to the cluster + +Configure kubectl to connect to your Kubernetes cluster using the az aks get-credentials command. + +``` +az aks get-credentials --resource-group myResourceGroup --name myAKSCluster +``` + +Verify the connection to your cluster using the `kubectl get` command. This command returns a list of the cluster nodes. + +``` +kubectl get nodes +``` + +You should get results like the ones below. Make sure the node status is Ready. + +``` +NAME STATUS ROLES AGE VERSION +aks-nodepool1-37660971-vmss000000 Ready agent 24h v1.25.6 +aks-nodepool1-37660971-vmss000001 Ready agent 24h v1.25.6 +aks-nodepool1-37660971-vmss000002 Ready agent 24h v1.25.6 +``` + +## Setup DataHub using Helm + +Once the Kubernetes cluster has been set up, you can deploy DataHub and its prerequisites using helm. Please follow the +steps in this [guide](kubernetes.md). + + +Notes: +Since we are using PostgreSQL as the storage layer, change postgresql enabled to true and mysql to false in the values.yaml file of prerequisites. +Additionally, create a postgresql secret. Make sure to include 3 passwords for the postgresql secret: postgres-password, replication-password, and password. + +## Expose endpoints using a load balancer + +Now that all the pods are up and running, you need to expose the datahub-frontend endpoint by setting +up [ingress](https://kubernetes.io/docs/concepts/services-networking/ingress/). To do this, you need to first set up an +ingress controller. + + +There are many [ingress controllers](https://kubernetes.io/docs/concepts/services-networking/ingress-controllers/) to choose +from, but here, we will follow this [guide](https://learn.microsoft.com/en-us/azure/application-gateway/tutorial-ingress-controller-add-on-existing) to set up the Azure +Application Gateway Ingress Controller. + +- Deploy a New Application Gateway. + +First, you need to create a WAF policy: + +``` +az network application-gateway waf-policy create -g myResourceGroup -n myWAFPolicy +``` + +- Before the application gateway can be deployed, you'll also need to create a public IP resource, a new virtual network with address space 10.0.0.0/16, and a subnet with address space 10.0.0.0/24. +Then, you can deploy your application gateway in the subnet using the publicIP.
+ +Caution: When you use an AKS cluster and application gateway in separate virtual networks, the address spaces of the two virtual networks must not overlap. The default address space that an AKS cluster deploys in is 10.224.0.0/12. + + +``` +az network public-ip create -n myPublicIp -g myResourceGroup --allocation-method Static --sku Standard +az network vnet create -n myVnet -g myResourceGroup --address-prefix 10.0.0.0/16 --subnet-name mySubnet --subnet-prefix 10.0.0.0/24 +az network application-gateway create -n myApplicationGateway -l eastus -g myResourceGroup --sku WAF_v2 --public-ip-address myPublicIp --vnet-name myVnet --subnet mySubnet --priority 100 --waf-policy /subscriptions/{subscription_id}/resourceGroups/myResourceGroup/providers/Microsoft.Network/ApplicationGatewayWebApplicationFirewallPolicies/myWAFPolicy +``` +Change myPublicIp, myResourceGroup, myVnet, mySubnet, and myApplicationGateway to names of your choosing. + + +- Enable the AGIC Add-On in Existing AKS Cluster Through Azure CLI + +``` +appgwId=$(az network application-gateway show -n myApplicationGateway -g myResourceGroup -o tsv --query "id") +az aks enable-addons -n myAKSCluster -g myResourceGroup -a ingress-appgw --appgw-id $appgwId +``` + +- Peer the Two Virtual Networks Together + +Since you deployed the AKS cluster in its own virtual network and the Application gateway in another virtual network, you'll need to peer the two virtual networks together in order for traffic to flow from the Application gateway to the pods in the cluster. + +``` +nodeResourceGroup=$(az aks show -n myAKSCluster -g myResourceGroup -o tsv --query "nodeResourceGroup") +aksVnetName=$(az network vnet list -g $nodeResourceGroup -o tsv --query "[0].name") + +aksVnetId=$(az network vnet show -n $aksVnetName -g $nodeResourceGroup -o tsv --query "id") +az network vnet peering create -n AppGWtoAKSVnetPeering -g myResourceGroup --vnet-name myVnet --remote-vnet $aksVnetId --allow-vnet-access + +appGWVnetId=$(az network vnet show -n myVnet -g myResourceGroup -o tsv --query "id") +az network vnet peering create -n AKStoAppGWVnetPeering -g $nodeResourceGroup --vnet-name $aksVnetName --remote-vnet $appGWVnetId --allow-vnet-access +``` + +- Deploy the Ingress on the Frontend Pod + +In order to use the ingress controller to expose the frontend pod, we need to update the datahub-frontend section of the values.yaml file that was used to deploy DataHub. Here is a sample configuration: + +``` +datahub-frontend: + enabled: true + image: + repository: linkedin/datahub-frontend-react + # tag: "v0.10.0 # defaults to .global.datahub.version + + # Set up ingress to expose react front-end + ingress: + enabled: true + annotations: + kubernetes.io/ingress.class: azure/application-gateway + appgw.ingress.kubernetes.io/backend-protocol: "http" + + hosts: + - paths: + - /* + defaultUserCredentials: {} +``` + +You can then apply the updates: + +``` +helm upgrade --install datahub datahub/datahub --values values.yaml +``` + +You can now verify that the ingress was created correctly: + +``` +kubectl get ingress +``` + +You should see a result like this: + +![frontend-image](https://github.com/Saketh-Mahesh/azure-docs-images/blob/main/frontend-status.png?raw=true) + +## Use PostgreSQL for the storage layer +Configure a PostgreSQL database in the same virtual network as the Kubernetes cluster or implement virtual network peering to connect both networks. Once the database is provisioned, you should be able to see the following page under the Connect tab on the left side.
+ + +Note: The PostgreSQL database MUST be deployed in the same location as the AKS cluster/resource group (eastus, centralus, etc.). +Take note of the connection details: + +![postgres-info](https://github.com/Saketh-Mahesh/azure-docs-images/blob/main/postgres-info.png?raw=true) + + + + + +- Update the postgresql settings under global in the values.yaml as follows. + +``` +global: + sql: + datasource: + host: "${POSTGRES_HOST}.postgres.database.azure.com:5432" + hostForpostgresqlClient: "${POSTGRES_HOST}.postgres.database.azure.com" + port: "5432" + url: "jdbc:postgresql://${POSTGRES_HOST}.postgres.database.azure.com:5432/datahub?user=${POSTGRES_ADMIN_LOGIN}&password=${POSTGRES_ADMIN_PASSWORD}&sslmode=require" + driver: "org.postgresql.Driver" + username: "${POSTGRES_ADMIN_LOGIN}" + password: + value: "${POSTGRES_ADMIN_PASSWORD}" +``` +Run this helm command to update the DataHub configuration: + +``` +helm upgrade --install datahub datahub/datahub --values values.yaml +``` + +And there you go! You have now installed DataHub on an Azure Kubernetes Cluster with an ingress controller set up to expose the frontend. Additionally, you have utilized PostgreSQL as the storage layer of DataHub. \ No newline at end of file diff --git a/docs/deploy/confluent-cloud.md b/docs/deploy/confluent-cloud.md index 794b55d4686bfb..096fd9984f474d 100644 --- a/docs/deploy/confluent-cloud.md +++ b/docs/deploy/confluent-cloud.md @@ -16,6 +16,11 @@ First, you'll need to create following new topics in the [Confluent Control Cent 6. (Deprecated) **MetadataChangeEvent_v4**: Metadata change proposal messages 7. (Deprecated) **MetadataAuditEvent_v4**: Metadata change log messages 8. (Deprecated) **FailedMetadataChangeEvent_v4**: Failed to process #1 event +9. **MetadataGraphEvent_v4**: +10. **MetadataGraphEvent_v4**: +11. **PlatformEvent_v1** +12. **DataHubUpgradeHistory_v1**: Notifies the end of DataHub Upgrade job so dependants can act accordingly (_eg_, startup). + Note this topic requires special configuration: **Infinite retention**. Also, 1 partition is enough for the occasional traffic. The first five are the most important, and are explained in more depth in [MCP/MCL](../advanced/mcp-mcl.md). The final topics are those which are deprecated but still used under certain circumstances. It is likely that in the future they will be completely diff --git a/docs/deploy/environment-vars.md b/docs/deploy/environment-vars.md index 0689db9b173310..4c7b249349ca01 100644 --- a/docs/deploy/environment-vars.md +++ b/docs/deploy/environment-vars.md @@ -67,21 +67,26 @@ In general, there are **lots** of Kafka configuration environment variables for These environment variables follow the standard Spring representation of properties as environment variables. Simply replace the dot, `.`, with an underscore, `_`, and convert to uppercase. -| Variable | Default | Unit/Type | Components | Description | |-----------------------------------------------------|----------------------------------------------|-----------|-----------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `KAFKA_LISTENER_CONCURRENCY` | 1 | integer | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Number of Kafka consumer threads. Optimize throughput by matching to topic partitions. | -| `SPRING_KAFKA_PRODUCER_PROPERTIES_MAX_REQUEST_SIZE` | 1048576 | bytes | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Max produced message size. 
Note that the topic configuration is not controlled by this variable. | -| `SCHEMA_REGISTRY_TYPE` | `INTERNAL` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Schema registry implementation. One of `INTERNAL` or `KAFKA` or `AWS_GLUE` | -| `KAFKA_SCHEMAREGISTRY_URL` | `http://localhost:8080/schema-registry/api/` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Schema registry url. Used for `INTERNAL` and `KAFKA`. The default value is for the `GMS` component. The `MCE Consumer` and `MAE Consumer` should be the `GMS` hostname and port. | -| `AWS_GLUE_SCHEMA_REGISTRY_REGION` | `us-east-1` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | If using `AWS_GLUE` in the `SCHEMA_REGISTRY_TYPE` variable for the schema registry implementation. | -| `AWS_GLUE_SCHEMA_REGISTRY_NAME` | `` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | If using `AWS_GLUE` in the `SCHEMA_REGISTRY_TYPE` variable for the schema registry. | -| `USE_CONFLUENT_SCHEMA_REGISTRY` | `true` | boolean | [`kafka-setup`] | Enable Confluent schema registry configuration. | +| Variable | Default | Unit/Type | Components | Description | +|-----------------------------------------------------|----------------------------------------------|-----------|--------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `KAFKA_LISTENER_CONCURRENCY` | 1 | integer | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Number of Kafka consumer threads. Optimize throughput by matching to topic partitions. | +| `SPRING_KAFKA_PRODUCER_PROPERTIES_MAX_REQUEST_SIZE` | 1048576 | bytes | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Max produced message size. Note that the topic configuration is not controlled by this variable. | +| `SCHEMA_REGISTRY_TYPE` | `INTERNAL` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Schema registry implementation. One of `INTERNAL` or `KAFKA` or `AWS_GLUE` | +| `KAFKA_SCHEMAREGISTRY_URL` | `http://localhost:8080/schema-registry/api/` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | Schema registry url. Used for `INTERNAL` and `KAFKA`. The default value is for the `GMS` component. The `MCE Consumer` and `MAE Consumer` should be the `GMS` hostname and port. | +| `AWS_GLUE_SCHEMA_REGISTRY_REGION` | `us-east-1` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | If using `AWS_GLUE` in the `SCHEMA_REGISTRY_TYPE` variable for the schema registry implementation. | +| `AWS_GLUE_SCHEMA_REGISTRY_NAME` | `` | string | [`GMS`, `MCE Consumer`, `MAE Consumer`] | If using `AWS_GLUE` in the `SCHEMA_REGISTRY_TYPE` variable for the schema registry. | +| `USE_CONFLUENT_SCHEMA_REGISTRY` | `true` | boolean | [`kafka-setup`] | Enable Confluent schema registry configuration. | +| `KAFKA_PRODUCER_MAX_REQUEST_SIZE` | `5242880` | integer | [`Frontend`, `GMS`, `MCE Consumer`, `MAE Consumer`] | Max produced message size. Note that the topic configuration is not controlled by this variable. | +| `KAFKA_CONSUMER_MAX_PARTITION_FETCH_BYTES` | `5242880` | integer | [`GMS`, `MCE Consumer`, `MAE Consumer`] | The maximum amount of data per-partition the server will return. Records are fetched in batches by the consumer. 
If the first record batch in the first non-empty partition of the fetch is larger than this limit, the batch will still be returned to ensure that the consumer can make progress. | +| `MAX_MESSAGE_BYTES` | `5242880` | integer | [`kafka-setup`] | Sets the max message size on the kafka topics. | +| `KAFKA_PRODUCER_COMPRESSION_TYPE` | `snappy` | string | [`Frontend`, `GMS`, `MCE Consumer`, `MAE Consumer`] | The compression used by the producer. | ## Frontend -| Variable | Default | Unit/Type | Components | Description | |------------------------------------|----------|-----------|--------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `AUTH_VERBOSE_LOGGING` | `false` | boolean | [`Frontend`] | Enable verbose authentication logging. Enabling this will leak sensisitve information in the logs. Disable when finished debugging. | -| `AUTH_OIDC_GROUPS_CLAIM` | `groups` | string | [`Frontend`] | Claim to use as the user's group. | -| `AUTH_OIDC_EXTRACT_GROUPS_ENABLED` | `false` | boolean | [`Frontend`] | Auto-provision the group from the user's group claim. | -| `AUTH_SESSION_TTL_HOURS` | `24` | string | [`Frontend`] | The number of hours a user session is valid. [User session tokens are stateless and will become invalid after this time](https://www.playframework.com/documentation/2.8.x/SettingsSession#Session-Timeout-/-Expiration) requiring a user to login again. | \ No newline at end of file +| Variable | Default | Unit/Type | Components | Description | +|------------------------------------|----------|-----------|---------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `AUTH_VERBOSE_LOGGING` | `false` | boolean | [`Frontend`] | Enable verbose authentication logging. Enabling this will leak sensitive information in the logs. Disable when finished debugging. | +| `AUTH_OIDC_GROUPS_CLAIM` | `groups` | string | [`Frontend`] | Claim to use as the user's group. | +| `AUTH_OIDC_EXTRACT_GROUPS_ENABLED` | `false` | boolean | [`Frontend`] | Auto-provision the group from the user's group claim. | +| `AUTH_SESSION_TTL_HOURS` | `24` | string | [`Frontend`] | The number of hours a user session is valid. After this many hours the actor cookie will be expired by the browser and the user will be prompted to login again. | +| `MAX_SESSION_TOKEN_AGE` | `24h` | string | [`Frontend`] | The maximum age of the session token. [User session tokens are stateless and will become invalid after this time](https://www.playframework.com/documentation/2.8.x/SettingsSession#Session-Timeout-/-Expiration) requiring a user to login again. | \ No newline at end of file diff --git a/docs/dev-guides/timeline.md b/docs/dev-guides/timeline.md index 829aef1d3eefa1..6a8e158d40ebf0 100644 --- a/docs/dev-guides/timeline.md +++ b/docs/dev-guides/timeline.md @@ -228,7 +228,7 @@ http://localhost:8080/openapi/timeline/v1/urn%3Ali%3Adataset%3A%28urn%3Ali%3Adat REMOVE GLOSSARY_TERM dataset:hive:testTimelineDataset (urn:li:glossaryTerm:SavingsAccount): The GlossaryTerm 'SavingsAccount' for the entity 'urn:li:dataset:(urn:li:dataPlatform:hive,testTimelineDataset,PROD)' has been removed. 
``` -# Explore the API +## Explore the API The API is browse-able via the UI through the dropdown. Here are a few screenshots showing how to navigate to it. You can try out the API and send example requests. @@ -243,7 +243,7 @@ Here are a few screenshots showing how to navigate to it. You can try out the AP

-# Future Work +## Future Work - Supporting versions as start and end parameters as part of the call to the timeline API - Supporting entities beyond Datasets diff --git a/docs/how/kafka-config.md b/docs/how/kafka-config.md index f3f81c3d07c014..2f20e8b548f835 100644 --- a/docs/how/kafka-config.md +++ b/docs/how/kafka-config.md @@ -52,16 +52,21 @@ Also see [Kafka Connect Security](https://docs.confluent.io/current/connect/secu By default, DataHub relies on a set of Kafka topics to operate. By default, they have the following names: -- **MetadataChangeProposal_v1** -- **FailedMetadataChangeProposal_v1** -- **MetadataChangeLog_Versioned_v1** -- **MetadataChangeLog_Timeseries_v1** -- **DataHubUsageEvent_v1**: User behavior tracking event for UI +1. **MetadataChangeProposal_v1** +2. **FailedMetadataChangeProposal_v1** +3. **MetadataChangeLog_Versioned_v1** +4. **MetadataChangeLog_Timeseries_v1** +5. **DataHubUsageEvent_v1**: User behavior tracking event for UI 6. (Deprecated) **MetadataChangeEvent_v4**: Metadata change proposal messages 7. (Deprecated) **MetadataAuditEvent_v4**: Metadata change log messages 8. (Deprecated) **FailedMetadataChangeEvent_v4**: Failed to process #1 event +9. **MetadataGraphEvent_v4**: +10. **MetadataGraphEvent_v4**: +11. **PlatformEvent_v1**: +12. **DataHubUpgradeHistory_v1**: Notifies the end of DataHub Upgrade job so dependants can act accordingly (_eg_, startup). + Note this topic requires special configuration: **Infinite retention**. Also, 1 partition is enough for the occasional traffic. -These topics are discussed at more length in [Metadata Events](../what/mxe.md). +How Metadata Events relate to these topics is discussed at more length in [Metadata Events](../what/mxe.md). We've included environment variables to customize the name of each of these topics, for cases where an organization has naming rules for its topics. diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index 9b19291ee246ae..28f11e4b6d7072 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -5,13 +5,76 @@ This file documents any backwards-incompatible changes in DataHub and assists pe ## Next ### Breaking Changes + +### Potential Downtime + +### Deprecations + +### Other Notable Changes + +## 0.12.0 + +### Breaking Changes + +- #9044 - GraphQL APIs for adding ownership now expect either an `ownershipTypeUrn` referencing a custom ownership type or a (deprecated) `type`. Previously, adding ownership without a concrete type was allowed; this is no longer the case. For simplicity you can use the `type` parameter, which will get translated to a custom ownership type internally if one exists for the type being added. +- #9010 - In the Redshift source's config, `incremental_lineage` now defaults to off. - #8810 - Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now. +- #8942 - Removed `urn:li:corpuser:datahub` owner for the `Measure`, `Dimension` and `Temporal` tags emitted + by Looker and LookML source connectors. +- #8853 - The Airflow plugin no longer supports Airflow 2.0.x or Python 3.7. See the docs for more details. +- #8853 - Introduced the Airflow plugin v2. If you're using Airflow 2.3+, the v2 plugin will be enabled by default, and so you'll need to switch your requirements to include `pip install 'acryl-datahub-airflow-plugin[plugin-v2]'`. To continue using the v1 plugin, set the `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN` environment variable to `true`. 
+- #8943 - The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled. +This is currently enabled by default to preserve compatibility, but will be disabled by default and then removed in the future. +If stateful ingestion is enabled, simply setting `include_metastore: false` will perform all required cleanup. +Otherwise, we recommend soft deleting all databricks data via the DataHub CLI: +`datahub delete --platform databricks --soft` and then reingesting with `include_metastore: false`. +- #8846 - Changed enum values in resource filters used by policies. `RESOURCE_TYPE` became `TYPE` and `RESOURCE_URN` became `URN`. +Any existing policies using these filters (i.e. defined for particular `urns` or `types` such as `dataset`) need to be upgraded +manually, for example by retrieving their respective `dataHubPolicyInfo` aspect and changing the part that uses the filter, i.e. +```yaml + "resources": { + "filter": { + "criteria": [ + { + "field": "RESOURCE_TYPE", + "condition": "EQUALS", + "values": [ + "dataset" + ] + } + ] + } +``` +into +```yaml + "resources": { + "filter": { + "criteria": [ + { + "field": "TYPE", + "condition": "EQUALS", + "values": [ + "dataset" + ] + } + ] + } +``` +for example, using the `datahub put` command. Policies can also be removed and re-created via the UI. +- #9077 - The BigQuery ingestion source by default sets `match_fully_qualified_names: true`. +This means that any `dataset_pattern` or `schema_pattern` specified will be matched on the fully +qualified dataset name, i.e. `<project_id>.<dataset_name>`. We attempt to support the old +pattern format by prepending `.*\\.` to dataset patterns lacking a period, so in most cases this +should not cause any issues. However, if you have a complex dataset pattern, we recommend you +manually convert it to the fully qualified format to avoid any potential issues. ### Potential Downtime ### Deprecations ### Other Notable Changes +- Session token configuration has changed; all previously created session tokens will be invalid and users will be prompted to log in. The expiration time has also been shortened, which may result in more login prompts with the default settings. + There should be no other interruption due to this change. ## 0.11.0 diff --git a/docs/lineage/airflow.md b/docs/lineage/airflow.md index 49de5352f6d58c..19ed1598d4c5a1 100644 --- a/docs/lineage/airflow.md +++ b/docs/lineage/airflow.md @@ -1,74 +1,137 @@ # Airflow Integration -DataHub supports integration of +:::note -- Airflow Pipeline (DAG) metadata -- DAG and Task run information as well as -- Lineage information when present +If you're looking to schedule DataHub ingestion using Airflow, see the guide on [scheduling ingestion with Airflow](../../metadata-ingestion/schedule_docs/airflow.md). -You can use either the DataHub Airflow lineage plugin (recommended) or the Airflow lineage backend (deprecated). +::: -## Using Datahub's Airflow lineage plugin +The DataHub Airflow plugin supports: -:::note +- Automatic column-level lineage extraction from various operators e.g. `SqlOperator`s (including `MySqlOperator`, `PostgresOperator`, `SnowflakeOperator`, and more), `S3FileTransformOperator`, and a few others. +- Airflow DAG and tasks, including properties, ownership, and tags. +- Task run information, including task successes and failures. +- Manual lineage annotations using `inlets` and `outlets` on Airflow operators. -The Airflow lineage plugin is only supported with Airflow version >= 2.0.2 or on MWAA with an Airflow version >= 2.0.2. 
+There are two actively supported implementations of the plugin, with different Airflow version support. -If you're using Airflow 1.x, use the Airflow lineage plugin with acryl-datahub-airflow-plugin <= 0.9.1.0. + | Approach | Airflow Version | Notes | | --------- | --------------- | --------------------------------------------------------------------------- | | Plugin v2 | 2.3+ | Recommended. Requires Python 3.8+ | | Plugin v1 | 2.1+ | No automatic lineage extraction; may not extract lineage if the task fails. | -::: +If you're using Airflow older than 2.1, it's possible to use the v1 plugin with older versions of `acryl-datahub-airflow-plugin`. See the [compatibility section](#compatibility) for more details. -This plugin registers a task success/failure callback on every task with a cluster policy and emits DataHub events from that. This allows this plugin to be able to register both task success as well as failures compared to the older Airflow Lineage Backend which could only support emitting task success. + + -### Setup +## DataHub Plugin v2 -1. You need to install the required dependency in your airflow. +### Installation + +The v2 plugin requires Airflow 2.3+ and Python 3.8+. If you don't meet these requirements, use the v1 plugin instead. ```shell -pip install acryl-datahub-airflow-plugin +pip install 'acryl-datahub-airflow-plugin[plugin-v2]' ``` -:::note ### Configuration -The [DataHub Rest](../../metadata-ingestion/sink_docs/datahub.md#datahub-rest) emitter is included in the plugin package by default. To use [DataHub Kafka](../../metadata-ingestion/sink_docs/datahub.md#datahub-kafka) install `pip install acryl-datahub-airflow-plugin[datahub-kafka]`. +Set up a DataHub connection in Airflow. -::: +```shell +airflow connections add --conn-type 'datahub-rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080' --conn-password '' +``` + +No additional configuration is required to use the plugin. However, there are some optional configuration parameters that can be set in the `airflow.cfg` file. + +```ini title="airflow.cfg" +[datahub] +# Optional - additional config here. +enabled = True # default +``` + +| Name | Default value | Description | | -------------------------- | -------------------- | ---------------------------------------------------------------------------------------- | | enabled | true | If the plugin should be enabled. | | conn_id | datahub_rest_default | The name of the datahub rest connection. | | cluster | prod | name of the airflow cluster | | capture_ownership_info | true | Extract DAG ownership. | | capture_tags_info | true | Extract DAG tags. | | capture_executions | true | Extract task runs and success/failure statuses. This will show up in the DataHub "Runs" tab. | | enable_extractors | true | Enable automatic lineage extraction. | | disable_openlineage_plugin | true | Disable the OpenLineage plugin to avoid duplicative processing. | | log_level | _no change_ | [debug] Set the log level for the plugin. | | debug_emitter | false | [debug] If true, the plugin will log the emitted events. | + +### Automatic lineage extraction + +To automatically extract lineage information, the v2 plugin builds on top of Airflow's built-in [OpenLineage extractors](https://openlineage.io/docs/integrations/airflow/default-extractors). -2. Disable lazy plugin loading in your airflow.cfg. 
- On MWAA you should add this config to your [Apache Airflow configuration options](https://docs.aws.amazon.com/mwaa/latest/userguide/configuring-env-variables.html#configuring-2.0-airflow-override). +The SQL-related extractors have been updated to use DataHub's SQL parser, which is more robust than the built-in one and uses DataHub's metadata information to generate column-level lineage. We discussed the DataHub SQL parser, including why schema-aware parsing works better and how it performs on benchmarks, during the [June 2023 community town hall](https://youtu.be/1QVcUmRQK5E?si=U27zygR7Gi_KdkzE&t=2309). + +## DataHub Plugin v1 + +### Installation + +The v1 plugin requires Airflow 2.1+ and Python 3.8+. If you're on older versions, it's still possible to use an older version of the plugin. See the [compatibility section](#compatibility) for more details. + +If you're using Airflow 2.3+, we recommend using the v2 plugin instead. If you need to use the v1 plugin with Airflow 2.3+, you must also set the environment variable `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN=true`. + +```shell +pip install 'acryl-datahub-airflow-plugin[plugin-v1]' + +# The DataHub rest connection type is included by default. +# To use the DataHub Kafka connection type, install the plugin with the kafka extras. +pip install 'acryl-datahub-airflow-plugin[plugin-v1,datahub-kafka]' +``` + + + +### Configuration + +#### Disable lazy plugin loading ```ini title="airflow.cfg" [core] lazy_load_plugins = False ``` -3. You must configure an Airflow hook for Datahub. We support both a Datahub REST hook and a Kafka-based hook, but you only need one. +On MWAA you should add this config to your [Apache Airflow configuration options](https://docs.aws.amazon.com/mwaa/latest/userguide/configuring-env-variables.html#configuring-2.0-airflow-override). + +#### Setup a DataHub connection - ```shell - # For REST-based: - airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080' --conn-password '' - # For Kafka-based (standard Kafka sink config can be passed via extras): - airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' - ``` +You must configure an Airflow connection for Datahub. We support both DataHub REST and Kafka-based connections, but you only need one. -4. Add your `datahub_conn_id` and/or `cluster` to your `airflow.cfg` file if it is not align with the default values. See configuration parameters below +```shell +# For REST-based: +airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080' --conn-password '' +# For Kafka-based (standard Kafka sink config can be passed via extras): +airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' +``` - **Configuration options:** +#### Configure the plugin - | Name | Default value | Description | - | ------------------------------ | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | - | datahub.enabled | true | If the plugin should be enabled. | - | datahub.conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. 
| - | datahub.cluster | prod | name of the airflow cluster | - | datahub.capture_ownership_info | true | If true, the owners field of the DAG will be capture as a DataHub corpuser. | - | datahub.capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. | - | datahub.capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. | - | datahub.graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. | +If your config doesn't align with the default values, you can configure the plugin in your `airflow.cfg` file. + +```ini title="airflow.cfg" +[datahub] +enabled = true +conn_id = datahub_rest_default # or datahub_kafka_default +# etc. +``` -5. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). -6. [optional] Learn more about [Airflow lineage](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html), including shorthand notation and some automation. +| Name | Default value | Description | +| ---------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| enabled | true | If the plugin should be enabled. | +| conn_id | datahub_rest_default | The name of the datahub connection you set in step 1. | +| cluster | prod | name of the airflow cluster | +| capture_ownership_info | true | If true, the owners field of the DAG will be captured as a DataHub corpuser. | +| capture_tags_info | true | If true, the tags field of the DAG will be captured as DataHub tags. | +| capture_executions | true | If true, we'll capture task runs in DataHub in addition to DAG definitions. | +| graceful_exceptions | true | If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. | -### How to validate installation +#### Validate that the plugin is working 1. Go to the Admin -> Plugins menu in Airflow and check that you can see the DataHub plugin 2. Run an Airflow DAG. In the task logs, you should see Datahub related log messages like: @@ -77,9 +140,22 @@ lazy_load_plugins = False Emitting DataHub ... ``` -### Emitting lineage via a custom operator to the Airflow Plugin +## Manual Lineage Annotation + +### Using `inlets` and `outlets` + +You can manually annotate lineage by setting `inlets` and `outlets` on your Airflow operators. This is useful if you're using an operator that doesn't support automatic lineage extraction, or if you want to override the automatic lineage extraction. 
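For instance, here is a minimal sketch of annotating a task this way (the operator, platform, and table names below are illustrative assumptions; the `Dataset` entity class is the one the plugin provides):

```python
from airflow.operators.bash import BashOperator

from datahub_airflow_plugin.entities import Dataset

# Illustrative only: the task and the Snowflake table names are made up.
transform = BashOperator(
    task_id="transform_users",
    bash_command="echo transforming",
    # Upstream dataset(s) this task reads from.
    inlets=[Dataset("snowflake", "mydb.schema.users_raw")],
    # Downstream dataset(s) this task writes to.
    outlets=[Dataset("snowflake", "mydb.schema.users_clean")],
)
```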
+ +We have a few code samples that demonstrate how to use `inlets` and `outlets`: -If you have created a custom Airflow operator [docs](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class, +- [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py) +- [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) - uses the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html) + +For more information, take a look at the [Airflow lineage docs](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html). + +### Custom Operators + +If you have created a [custom Airflow operator](https://airflow.apache.org/docs/apache-airflow/stable/howto/custom-operator.html) that inherits from the BaseOperator class, when overriding the `execute` function, set inlets and outlets via `context['ti'].task.inlets` and `context['ti'].task.outlets`. The DataHub Airflow plugin will then pick up those inlets and outlets after the task runs. @@ -90,7 +166,7 @@ class DbtOperator(BaseOperator): def execute(self, context): # do something inlets, outlets = self._get_lineage() - # inlets/outlets are lists of either datahub_provider.entities.Dataset or datahub_provider.entities.Urn + # inlets/outlets are lists of either datahub_airflow_plugin.entities.Dataset or datahub_airflow_plugin.entities.Urn context['ti'].task.inlets = self.inlets context['ti'].task.outlets = self.outlets @@ -100,78 +176,25 @@ class DbtOperator(BaseOperator): return inlets, outlets ``` -If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. [source](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage) - -## Using DataHub's Airflow lineage backend (deprecated) - -:::caution - -The DataHub Airflow plugin (above) is the recommended way to integrate Airflow with DataHub. For managed services like MWAA, the lineage backend is not supported and so you must use the Airflow plugin. - -If you're using Airflow 1.x, we recommend using the Airflow lineage backend with acryl-datahub <= 0.9.1.0. - -::: - -:::note - -If you are looking to run Airflow and DataHub using docker locally, follow the guide [here](../../docker/airflow/local_airflow.md). Otherwise proceed to follow the instructions below. -::: - -### Setting up Airflow to use DataHub as Lineage Backend - -1. You need to install the required dependency in your airflow. See - -```shell -pip install acryl-datahub[airflow] -# If you need the Kafka-based emitter/hook: -pip install acryl-datahub[airflow,datahub-kafka] -``` - -2. You must configure an Airflow hook for Datahub. We support both a Datahub REST hook and a Kafka-based hook, but you only need one. - - ```shell - # For REST-based: - airflow connections add --conn-type 'datahub_rest' 'datahub_rest_default' --conn-host 'http://datahub-gms:8080' --conn-password '' - # For Kafka-based (standard Kafka sink config can be passed via extras): - airflow connections add --conn-type 'datahub_kafka' 'datahub_kafka_default' --conn-host 'broker:9092' --conn-extra '{}' - ``` +If you override the `pre_execute` and `post_execute` function, ensure they include the `@prepare_lineage` and `@apply_lineage` decorators respectively. 
Reference the [Airflow docs](https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/lineage.html#lineage) for more details. -3. Add the following lines to your `airflow.cfg` file. +## Emit Lineage Directly - ```ini title="airflow.cfg" - [lineage] - backend = datahub_provider.lineage.datahub.DatahubLineageBackend - datahub_kwargs = { - "enabled": true, - "datahub_conn_id": "datahub_rest_default", - "cluster": "prod", - "capture_ownership_info": true, - "capture_tags_info": true, - "graceful_exceptions": true } - # The above indentation is important! - ``` +If you can't use the plugin or annotate inlets/outlets, you can also emit lineage using the `DatahubEmitterOperator`. - **Configuration options:** +Reference [`lineage_emission_dag.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py) for a full example. - - `datahub_conn_id` (required): Usually `datahub_rest_default` or `datahub_kafka_default`, depending on what you named the connection in step 1. - - `cluster` (defaults to "prod"): The "cluster" to associate Airflow DAGs and tasks with. - - `capture_ownership_info` (defaults to true): If true, the owners field of the DAG will be capture as a DataHub corpuser. - - `capture_tags_info` (defaults to true): If true, the tags field of the DAG will be captured as DataHub tags. - - `capture_executions` (defaults to false): If true, it captures task runs as DataHub DataProcessInstances. - - `graceful_exceptions` (defaults to true): If set to true, most runtime errors in the lineage backend will be suppressed and will not cause the overall task to fail. Note that configuration issues will still throw exceptions. +In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See the plugin configuration for examples. -4. Configure `inlets` and `outlets` for your Airflow operators. For reference, look at the sample DAG in [`lineage_backend_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_demo.py), or reference [`lineage_backend_taskflow_demo.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_backend_taskflow_demo.py) if you're using the [TaskFlow API](https://airflow.apache.org/docs/apache-airflow/stable/concepts/taskflow.html). -5. [optional] Learn more about [Airflow lineage](https://airflow.apache.org/docs/apache-airflow/stable/lineage.html), including shorthand notation and some automation. - -## Emitting lineage via a separate operator - -Take a look at this sample DAG: +## Debugging -- [`lineage_emission_dag.py`](../../metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py) - emits lineage using the DatahubEmitterOperator. +### Missing lineage -In order to use this example, you must first configure the Datahub hook. Like in ingestion, we support a Datahub REST hook and a Kafka-based hook. See step 1 above for details. +If you're not seeing lineage in DataHub, check the following: -## Debugging +- Validate that the plugin is loaded in Airflow. Go to Admin -> Plugins and check that the DataHub plugin is listed. +- If using the v2 plugin's automatic lineage, ensure that the `enable_extractors` config is set to true and that automatic lineage is supported for your operator. 
+- If using manual lineage annotation, ensure that you're using the `datahub_airflow_plugin.entities.Dataset` or `datahub_airflow_plugin.entities.Urn` classes for your inlets and outlets. ### Incorrect URLs @@ -179,9 +202,21 @@ If your URLs aren't being generated correctly (usually they'll start with `http: ```ini title="airflow.cfg" [webserver] -base_url = http://airflow.example.com +base_url = http://airflow.mycorp.example.com ``` +## Compatibility + +We no longer officially support Airflow <2.1. However, you can use older versions of `acryl-datahub-airflow-plugin` with older versions of Airflow. +Both of these options support Python 3.7+. + +- For Airflow 1.10.x, use DataHub plugin v1 with acryl-datahub-airflow-plugin <= 0.9.1.0. +- For Airflow 2.0.x, use DataHub plugin v1 with acryl-datahub-airflow-plugin <= 0.11.0.1. + +DataHub also previously supported an Airflow [lineage backend](https://airflow.apache.org/docs/apache-airflow/2.2.0/lineage.html#lineage-backend) implementation. While the implementation is still in our codebase, it is deprecated and will be removed in a future release. +Note that the lineage backend did not support automatic lineage extraction, did not capture task failures, and did not work in AWS MWAA. +The [documentation for the lineage backend](https://docs-website-1wmaehubl-acryldata.vercel.app/docs/lineage/airflow/#using-datahubs-airflow-lineage-backend-deprecated) has already been archived. + ## Additional references Related Datahub videos: diff --git a/docs/managed-datahub/chrome-extension.md b/docs/managed-datahub/chrome-extension.md index 0aa0860d03b67a..a4560bc8cc09ba 100644 --- a/docs/managed-datahub/chrome-extension.md +++ b/docs/managed-datahub/chrome-extension.md @@ -1,10 +1,8 @@ --- description: Learn how to upload and use the Acryl DataHub Chrome extension (beta) locally before it's available on the Chrome store. --- -import FeatureAvailability from '@site/src/components/FeatureAvailability'; # Acryl DataHub Chrome Extension - ## Installing the Extension diff --git a/docs/managed-datahub/observe/column-assertions.md b/docs/managed-datahub/observe/column-assertions.md new file mode 100644 index 00000000000000..99a764f7716766 --- /dev/null +++ b/docs/managed-datahub/observe/column-assertions.md @@ -0,0 +1,358 @@ +--- +description: This page provides an overview of working with DataHub Column Assertions +--- +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + + +# Column Assertions + + + +> ⚠️ The **Column Assertions** feature is currently in private beta, part of the **Acryl Observe** module, and may only +> be available to a limited set of design partners. +> +> If you are interested in trying it and providing feedback, please reach out to your Acryl Customer Success +> representative. + +## Introduction + +Can you remember a time when an important warehouse table column changed dramatically, with little or no notice? Perhaps the number of null values suddenly spiked, or a new value was added to a fixed set of possible values. If the answer is yes, how did you initially find out? We'll take a guess - someone looking at an internal reporting dashboard, or worse, a user using your product, sounded an alarm when a number looked a bit out of the ordinary. + +There are many reasons why important columns in your Snowflake, Redshift, or BigQuery tables may change - application code bugs, new feature rollouts, etc.
Oftentimes, these changes break important assumptions made about the data used in building key downstream data products like reporting dashboards or data-driven product features. + +What if you could reduce the time to detect these incidents, so that the people responsible for the data were made aware of data issues before anyone else? With Acryl DataHub Column Assertions, you can. + +With Acryl DataHub, you can define **Column Value** assertions to ensure each value in a column matches specific constraints, and **Column Metric** assertions to ensure that computed metrics from columns align with your expectations. As soon as things go wrong, your team will be the first to know, before the data issue becomes a larger data incident. + +In this guide, we'll cover the basics of Column Assertions - what they are, how to configure them, and more - so that you and your team can start building trust in your most important data assets. + +Let's dive in! + +## Support + +Column Assertions are currently supported for: + +1. Snowflake +2. Redshift +3. BigQuery + +Note that an Ingestion Source _must_ be configured with the data platform of your choice in +Acryl DataHub's **Ingestion** tab. + +> Note that Column Assertions are not yet supported if you are connecting to your warehouse +> using the DataHub CLI or a Remote Ingestion Executor. + +## What is a Column Assertion? + +A **Column Assertion** is a highly configurable Data Quality rule used to monitor specific columns of a Data Warehouse table for unexpected changes. + +Column Assertions are defined to validate a specific column, and can be used to: + +1. Validate that the values of the column match some constraints (regex, allowed values, max, min, etc.) across rows, OR +2. Validate that specific column aggregation metrics match some expectations across rows. + +Column Assertions can be particularly useful for documenting and enforcing column-level "contracts", i.e. formal specifications about the expected contents of a particular column that can be used for coordinating among producers and consumers of the data. + +### Anatomy of a Column Assertion + +Column Assertions can be divided into two main types: **Column Value** and **Column Metric** Assertions. + +A **Column Value Assertion** is used to monitor the value of a specific column in a table, and ensure that every row +adheres to a specific condition. In comparison, a **Column Metric Assertion** is used to compute a metric for that column, +and ensure that the value of that metric adheres to a specific condition. + +At the most basic level, both types consist of a few important parts: + +1. An **Evaluation Schedule** +2. A **Column Selection** +3. An **Evaluation Criteria** +4. A **Row Evaluation Type** + +In this section, we'll give an overview of each. + +#### 1. Evaluation Schedule + +The **Evaluation Schedule**: This defines how often to evaluate the Column Assertion against the given warehouse table. +This should usually be configured to match the expected change frequency of the table, although it can also be evaluated less +frequently depending on your requirements. You can also specify specific days of the week, hours in the day, or even +minutes in an hour. + +#### 2. Column Selection + +The **Column Selection**: This defines the column that should be monitored by the Column Assertion. You can choose from +any of the columns from the table listed in the dropdown. Note that columns of struct / object type are not currently supported. + +#### 3. Evaluation Criteria + +The **Evaluation Criteria**: This defines the condition that must be satisfied in order for the Column +Assertion to pass. + +For **Column Value Assertions**, you will be able to choose from a set of operators that can be applied to the column +value. The options presented will vary based on the data type of the selected column. For example, if you've selected a numeric column, you +can verify that the column value is greater than a particular value. For string types, you can check that the column value +matches a particular regex pattern. Additionally, you are able to control the behavior of the check in the presence of NULL values. If the +**Allow Nulls** option is _disabled_, then any null values encountered will be reported as a failure when evaluating the +assertion. If **Allow Nulls** is enabled, then nulls will be ignored; the condition will be evaluated for rows where the column value is non-null. + +For **Column Metric Assertions**, you will be able to choose from a list of common column metrics - MAX, MIN, MEAN, NULL COUNT, etc - and then compare these metric values to an expected value. The list of metrics will vary based on the type of the selected column. For example, +if you've selected a numeric column, you can choose to compute the MEAN value of the column, and then assert that it is greater than a +specific number. For string types, you can choose to compute the MAX LENGTH of the string across all column values, and then assert that it +is less than a specific number. + +#### 4. Row Selection Set + +The **Row Selection Set**: This defines which rows in the table the Column Assertion will be evaluated across. You can choose +from the following options: + +- **All Table Rows**: Evaluate the Column Assertion across all rows in the table. This is the default option. Note that +this may not be desirable for large tables. + +- **Only Rows That Have Changed**: Evaluate the Column Assertion only against rows that have changed since the last +evaluation of the assertion. If you choose this option, you will need to specify a **High Watermark Column** to help determine which rows +have changed. A **High Watermark Column** is a column that contains a constantly incrementing value - a date, a time, or +another always-increasing number - that can be used to find the "new rows" that were added since the previous evaluation. When selected, a query will be issued to the table to find only the rows that have changed since the previous assertion evaluation. + +## Creating a Column Assertion + +### Prerequisites + +1. **Permissions**: To create or delete Column Assertions for a specific entity on DataHub, you'll need to be granted the + `Edit Assertions` and `Edit Monitors` privileges for the entity. This is granted to Entity owners by default. + +2. **Data Platform Connection**: In order to create a Column Assertion, you'll need to have an **Ingestion Source** + configured to your Data Platform: Snowflake, BigQuery, or Redshift under the **Ingestion** tab. + +Once these are in place, you're ready to create your Column Assertions! + +### Steps + +1. Navigate to the Table that you want to monitor +2. Click the **Validations** tab + +

+ +

+ +3. Click **+ Create Assertion** + +

+ +

+ +4. Choose **Column** + +5. Configure the evaluation **schedule**. This is the frequency at which the assertion will be evaluated to produce a + pass or fail result, and the times when the column values will be checked. + +6. Configure the **column assertion type**. You can choose from **Column Value** or **Column Metric**. + **Column Value** assertions are used to monitor the value of a specific column in a table, and ensure that every row + adheres to a specific condition. **Column Metric** assertions are used to compute a metric for that column, and then compare the value of that metric to your expectations. + +

+ +

+ +7. Configure the **column selection**. This defines the column that should be monitored by the Column Assertion. + You can choose from any of the columns from the table listed in the dropdown. + +

+ +

+ +8. Configure the **evaluation criteria**. This step varies based on the type of assertion you chose in the previous step. + + - **Column Value Assertions**: You will be able to choose from a set of operators that can be applied to the column + value. The options presented will vary based on the data type of the selected column. For example, with numeric types, you + can check that the column value is greater than a specific value. For string types, you can check that the column value + matches a particular regex pattern. You will also be able to control the behavior of null values in the column. If the + **Allow Nulls** option is _disabled_, any null values encountered will be reported as a failure when evaluating the + assertion. + + - **Column Metric Assertions**: You will be able to choose from a list of common metrics and then specify the operator + and value to compare against. The list of metrics will vary based on the data type of the selected column. For example, + with numeric types, you can choose to compute the average value of the column, and then assert that it is greater than a + specific number. For string types, you can choose to compute the max length of all column values, and then assert that it + is less than a specific number. + +9. Configure the **row evaluation type**. This defines which rows in the table the Column Assertion should evaluate. You can choose + from the following options: + + - **All Table Rows**: Evaluate the Column Assertion against all rows in the table. This is the default option. Note that + this may not be desirable for large tables. + + - **Only Rows That Have Changed**: Evaluate the Column Assertion only against rows that have changed since the last + evaluation. If you choose this option, you will need to specify a **High Watermark Column** to help determine which rows + have changed. A **High Watermark Column** is a column that contains a constantly-incrementing value - a date, a time, or + another always-increasing number. When selected, a query will be issued to the table to find only the rows that have changed since the last assertion run. + +

+ +

+ +10. (Optional) Click **Advanced** to further customize the Column Assertion. The options listed here will vary based on the + type of assertion you chose in the previous step. + + - **Invalid Values Threshold**: For **Column Value** assertions, you can configure the number of invalid values + (i.e. rows) that are allowed to fail before the assertion is marked as failing. This is useful if you want to allow a limited number + of invalid values in the column. By default this is 0, meaning the assertion will fail if any rows have an invalid column value. + + - **Source**: For **Column Metric** assertions, you can choose the mechanism that will be used to obtain the column + metric. **Query** will issue a query to the dataset to compute the metric. **DataHub Dataset Profile** will use the + DataHub Dataset Profile metadata to compute the metric. Note that this option requires that dataset profiling + statistics are up-to-date as of the assertion run time. + + - **Additional Filters**: You can choose to add additional filters to the query that will be used to evaluate the + assertion. This is useful if you want to limit the assertion to a subset of rows in the table. Note this option will not + be available if you choose **DataHub Dataset Profile** as the **source**. + +11. Click **Next** +12. Configure actions that should be taken when the Column Assertion passes or fails + +

+ +

+ +- **Raise incident**: Automatically raise a new DataHub `Column` Incident for the Table whenever the Column Assertion is failing. This + may indicate that the Table is unfit for consumption. Configure Slack Notifications under **Settings** to be notified when + an incident is created due to an Assertion failure. +- **Resolve incident**: Automatically resolve any incidents that were raised due to failures in this Column Assertion. Note that + any other incidents will not be impacted. + +13. Click **Save**. + +And that's it! DataHub will now begin to monitor your Column Assertion for the table. + +To view the time of the next Column Assertion evaluation, simply click **Column** and then click on your +new Assertion: + +

+ +

+ +Once your assertion has run, you will begin to see Success or Failure status for the Table. + +

+ +

+ +## Stopping a Column Assertion + +In order to temporarily stop the evaluation of a Column Assertion: + +1. Navigate to the **Validations** tab of the table with the assertion +2. Click **Column** to open the Column Assertions list +3. Click the three-dot menu on the right side of the assertion you want to disable +4. Click **Stop** + +

+ +

+ +To resume the Column Assertion, simply click **Turn On**. + +

+ +

+ +## Creating Column Assertions via API + +Under the hood, Acryl DataHub implements Column Assertion Monitoring using two "entity" concepts: + +- **Assertion**: The specific expectation for the column metric. e.g. "The value of an integer column is greater than 10 for all rows in the table." This is the "what". + +- **Monitor**: The process responsible for evaluating the Assertion on a given evaluation schedule and using specific + mechanisms. This is the "how". + +Note that to create or delete Assertions and Monitors for a specific entity on DataHub, you'll need the +`Edit Assertions` and `Edit Monitors` privileges for it. + +#### GraphQL + +In order to create a Column Assertion that is being monitored on a specific **Evaluation Schedule**, you'll need to use two +GraphQL mutations to create a Column Assertion entity and an Assertion Monitor entity responsible for evaluating it. + +Start by creating the Column Assertion entity using the `createFieldAssertion` mutation and hang on to the 'urn' field of the Assertion entity +you get back. Then continue by creating a Monitor entity using the `createAssertionMonitor` mutation. + +##### Examples + +To create a Column Assertion Entity that checks that the value of an integer column is greater than 10: + +```graphql +mutation createFieldAssertion { + createFieldAssertion( + input: { + entityUrn: "", + type: FIELD_VALUES, + fieldValuesAssertion: { + field: { + path: "", + type: "NUMBER", + nativeType: "NUMBER(38,0)" + }, + operator: GREATER_THAN, + parameters: { + value: { + type: NUMBER, + value: "10" + } + }, + failThreshold: { + type: COUNT, + value: 0 + }, + excludeNulls: true + } + } + ) { + urn + } +} +``` + +To create an Assertion Monitor Entity that evaluates the column assertion every 8 hours using all rows in the table: + +```graphql +mutation createAssertionMonitor { + createAssertionMonitor( + input: { + entityUrn: "", + assertionUrn: "", + schedule: { + cron: "0 */8 * * *", + timezone: "America/Los_Angeles" + }, + parameters: { + type: DATASET_FIELD, + datasetFieldParameters: { + sourceType: ALL_ROWS_QUERY + } + } + } + ) { + urn + } +} +``` + +This entity defines _when_ to run the check (using CRON format - every 8th hour) and _how_ to run the check (using a query against all rows of the table). + +After creating the monitor, the new assertion will start to be evaluated every 8 hours in your selected timezone. + +You can delete assertions along with their monitors using GraphQL mutations: `deleteAssertion` and `deleteMonitor`. + +### Tips + +:::info +**Authorization** + +Remember to always provide a DataHub Personal Access Token when calling the GraphQL API.
To do so, just add the 'Authorization' header as follows: + +``` +Authorization: Bearer +``` + +**Exploring GraphQL API** + +Also, remember that you can play with an interactive version of the Acryl GraphQL API at `https://your-account-id.acryl.io/api/graphiql` +::: diff --git a/docs/managed-datahub/release-notes/v_0_2_11.md b/docs/managed-datahub/release-notes/v_0_2_11.md index 1f420908487127..c99d10201e0977 100644 --- a/docs/managed-datahub/release-notes/v_0_2_11.md +++ b/docs/managed-datahub/release-notes/v_0_2_11.md @@ -7,7 +7,7 @@ Release Availability Date Recommended CLI/SDK --- -- `v0.11.0` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.10.5.5 +- `v0.11.0` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.11.0 - [Deprecation] In LDAP ingestor, the manager_pagination_enabled changed to general pagination_enabled If you are using an older CLI/SDK version then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, github actions, airflow, in python SDK somewhere, Java SKD etc. This is a strong recommendation to upgrade as we keep on pushing fixes in the CLI and it helps us support you better. diff --git a/docs/managed-datahub/release-notes/v_0_2_12.md b/docs/managed-datahub/release-notes/v_0_2_12.md new file mode 100644 index 00000000000000..b13f471d9bf63c --- /dev/null +++ b/docs/managed-datahub/release-notes/v_0_2_12.md @@ -0,0 +1,30 @@ +# v0.2.12 +--- + +Release Availability Date +--- +13-Oct-2023 + +Recommended CLI/SDK +--- +- `v0.11.0.4` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.11.0.4 +- [breaking] Removed support for SQLAlchemy 1.3.x. Only SQLAlchemy 1.4.x is supported now. +- [breaking] Removed `urn:li:corpuser:datahub` owner for the `Measure`, `Dimension` and `Temporal` tags emitted by Looker and LookML source connectors. +- [breaking] The Airflow plugin no longer supports Airflow 2.0.x or Python 3.7. +- [breaking] Introduced the Airflow plugin v2. If you're using Airflow 2.3+, the v2 plugin will be enabled by default, and so you'll need to switch your requirements to include `pip install 'acryl-datahub-airflow-plugin[plugin-v2]'`. To continue using the v1 plugin, set the `DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN` environment variable to `true`. +- [breaking] The Unity Catalog ingestion source has a new option `include_metastore`, which will cause all urns to be changed when disabled. +This is currently enabled by default to preserve compatibility, but will be disabled by default and then removed in the future. +If stateful ingestion is enabled, simply setting `include_metastore: false` will perform all required cleanup. +Otherwise, we recommend soft deleting all databricks data via the DataHub CLI: +`datahub delete --platform databricks --soft` and then reingesting with `include_metastore: false`. + + +If you are using an older CLI/SDK version then please upgrade it. This applies for all CLI/SDK usages, if you are using it through your terminal, github actions, airflow, in python SDK somewhere, Java SDK etc. This is a strong recommendation to upgrade as we keep on pushing fixes in the CLI and it helps us support you better. + + +## Release Changelog +--- +- Since `v0.2.11` these changes from OSS DataHub https://github.com/datahub-project/datahub/compare/75252a3d9f6a576904be5a0790d644b9ae2df6ac...10a190470e8c932b6d34cba49de7dbcba687a088 have been pulled in.
+ +## Some notable features in this SaaS release +- Nested Domains available in this release diff --git a/docs/ui-ingestion.md b/docs/ui-ingestion.md index db2007e1e19a93..438ddd8823b7e7 100644 --- a/docs/ui-ingestion.md +++ b/docs/ui-ingestion.md @@ -1,5 +1,12 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; + +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + # Ingestion + + ## Introduction Starting in version `0.8.25`, DataHub supports creating, configuring, scheduling, & executing batch metadata ingestion using the DataHub user interface. This makes @@ -173,28 +180,29 @@ Finally, give your Ingestion Source a name. Once you're happy with your configurations, click 'Done' to save your changes. -##### Advanced: Running with a specific CLI version +##### Advanced ingestion configs: -DataHub comes pre-configured to use the latest version of the DataHub CLI ([acryl-datahub](https://pypi.org/project/acryl-datahub/)) that is compatible +DataHub's Managed Ingestion UI comes pre-configured to use the latest version of the DataHub CLI ([acryl-datahub](https://pypi.org/project/acryl-datahub/)) that is compatible with the server. However, you can override the default package version using the 'Advanced' source configurations. To do so, simply click 'Advanced', then change the 'CLI Version' text box to contain the exact version of the DataHub CLI you'd like to use. -

_Pinning the CLI version to version `0.8.23.2`_ +Other advanced options include specifying **environment variables**, **DataHub plugins**, or **Python packages** at runtime. + Once you're happy with your changes, simply click 'Done' to save.
You can upload and even update recipes using the cli as mentioned in the [cli documentation for uploading ingestion recipes](./cli.md#ingest-deploy). -An example execution would look something like: +An example execution for a given `recipe.yaml` file would look something like: ```bash datahub ingest deploy --name "My Test Ingestion Source" --schedule "5 * * * *" --time-zone "UTC" -c recipe.yaml @@ -330,8 +338,8 @@ for the `datahub-actions` container and running `docker logs `. There are valid cases for ingesting metadata without the UI-based ingestion scheduler. For example, - You have written a custom ingestion Source -- Your data sources are not reachable on the network where DataHub is deployed -- Your ingestion source requires context from a local filesystem (e.g. input files, environment variables, etc) +- Your data sources are not reachable on the network where DataHub is deployed. Managed DataHub users can use a [remote executor](managed-datahub/operator-guide/setting-up-remote-ingestion-executor-on-aws.md) for remote UI-based ingestion. +- Your ingestion source requires context from a local filesystem (e.g. input files) - You want to distribute metadata ingestion among multiple producers / environments ### How do I attach policies to the actions pod to give it permissions to pull metadata from various sources? diff --git a/docs/what-is-datahub/datahub-concepts.md b/docs/what-is-datahub/datahub-concepts.md index 6328d97fa6a502..03b86fab0ede41 100644 --- a/docs/what-is-datahub/datahub-concepts.md +++ b/docs/what-is-datahub/datahub-concepts.md @@ -99,7 +99,7 @@ List of Data Platforms - Tableau - Vertica -Reference : [data_platforms.json](https://github.com/acryldata/datahub-fork/blob/acryl-main/metadata-service/war/src/main/resources/boot/data_platforms.json) +Reference : [data_platforms.json](https://github.com/datahub-project/datahub/blob/master/metadata-service/war/src/main/resources/boot/data_platforms.json) diff --git a/gradle.properties b/gradle.properties index 2b211e725359af..1cd349344b432d 100644 --- a/gradle.properties +++ b/gradle.properties @@ -6,9 +6,16 @@ org.gradle.caching=false # Increase gradle JVM memory to 3GB to allow tests to run locally org.gradle.jvmargs=-Xmx3000m # Increase retries to 5 (from default of 3) and increase interval from 125ms to 1s. +# Based on this thread https://github.com/gradle/gradle/issues/4629, it's unclear +# if we should be using systemProp or not. We're using both for now.
org.gradle.internal.repository.max.retries=5 org.gradle.internal.repository.max.tentatives=5 org.gradle.internal.repository.initial.backoff=1000 +systemProp.org.gradle.internal.http.connectionTimeout=120000 +systemProp.org.gradle.internal.http.socketTimeout=120000 +systemProp.org.gradle.internal.repository.max.retries=5 +systemProp.org.gradle.internal.repository.max.tentatives=5 +systemProp.org.gradle.internal.repository.initial.backoff=1000 # Needed to publish to Nexus from a sub-module gnsp.disableApplyOnlyOnRootProjectEnforcement=true diff --git a/gradle/versioning/versioning.gradle b/gradle/versioning/versioning.gradle index 1fac894d165a84..39a8a3faf80117 100644 --- a/gradle/versioning/versioning.gradle +++ b/gradle/versioning/versioning.gradle @@ -21,7 +21,7 @@ Produces the following variables and supports token replacement import org.apache.tools.ant.filters.ReplaceTokens def detailedVersionString = "0.0.0-unknown-SNAPSHOT" -def cliMajorVersion = "0.10.5" // base default cli major version +def cliMajorVersion = "0.12.0" // base default cli major version def snapshotVersion = false if (project.hasProperty("releaseVersion")) { version = releaseVersion diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthUtil.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthUtil.java index dfb936c61ee0cd..e159993a8a2430 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthUtil.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthUtil.java @@ -11,7 +11,7 @@ public class AuthUtil { public static boolean isAuthorized( @Nonnull Authorizer authorizer, @Nonnull String actor, - @Nonnull Optional maybeResourceSpec, + @Nonnull Optional maybeResourceSpec, @Nonnull DisjunctivePrivilegeGroup privilegeGroup ) { for (ConjunctivePrivilegeGroup andPrivilegeGroup : privilegeGroup.getAuthorizedPrivilegeGroups()) { @@ -27,7 +27,7 @@ public static boolean isAuthorized( public static boolean isAuthorizedForResources( @Nonnull Authorizer authorizer, @Nonnull String actor, - @Nonnull List> resourceSpecs, + @Nonnull List> resourceSpecs, @Nonnull DisjunctivePrivilegeGroup privilegeGroup ) { for (ConjunctivePrivilegeGroup andPrivilegeGroup : privilegeGroup.getAuthorizedPrivilegeGroups()) { @@ -44,7 +44,7 @@ private static boolean isAuthorized( @Nonnull Authorizer authorizer, @Nonnull String actor, @Nonnull ConjunctivePrivilegeGroup requiredPrivileges, - @Nonnull Optional resourceSpec) { + @Nonnull Optional resourceSpec) { // Each privilege in a group _must_ all be true to permit the operation. for (final String privilege : requiredPrivileges.getRequiredPrivileges()) { // Create and evaluate an Authorization request. @@ -62,11 +62,11 @@ private static boolean isAuthorizedForResources( @Nonnull Authorizer authorizer, @Nonnull String actor, @Nonnull ConjunctivePrivilegeGroup requiredPrivileges, - @Nonnull List> resourceSpecs) { + @Nonnull List> resourceSpecs) { // Each privilege in a group _must_ all be true to permit the operation. for (final String privilege : requiredPrivileges.getRequiredPrivileges()) { // Create and evaluate an Authorization request. 
- for (Optional resourceSpec : resourceSpecs) { + for (Optional resourceSpec : resourceSpecs) { final AuthorizationRequest request = new AuthorizationRequest(actor, privilege, resourceSpec); final AuthorizationResult result = authorizer.authorize(request); if (AuthorizationResult.Type.DENY.equals(result.getType())) { diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizationRequest.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizationRequest.java index 084a4554955519..9e75de3cbf44d9 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizationRequest.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizationRequest.java @@ -21,5 +21,5 @@ public class AuthorizationRequest { * The resource that the user is requesting for, if applicable. If the privilege is a platform privilege * this optional will be empty. */ - Optional resourceSpec; + Optional resourceSpec; } diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizerContext.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizerContext.java index f9940d171d5d4f..b79a4fa20c7eae 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizerContext.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/AuthorizerContext.java @@ -18,9 +18,9 @@ public class AuthorizerContext { private final Map contextMap; /** - * A utility for resolving a {@link ResourceSpec} to resolved resource field values. + * A utility for resolving an {@link EntitySpec} to resolved entity field values. */ - private ResourceSpecResolver resourceSpecResolver; + private EntitySpecResolver entitySpecResolver; /** * diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java new file mode 100644 index 00000000000000..1258d958f20923 --- /dev/null +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntityFieldType.java @@ -0,0 +1,44 @@ +package com.datahub.authorization; + +/** + * List of entity field types to fetch for a given entity + */ +public enum EntityFieldType { + + /** + * Urn of the entity + * @deprecated + */ + @Deprecated + RESOURCE_URN, + /** + * Type of the entity (e.g. dataset, chart) + * @deprecated + */ + @Deprecated + RESOURCE_TYPE, + /** + * Type of the entity (e.g. dataset, chart) + */ + TYPE, + /** + * Urn of the entity + */ + URN, + /** + * Owners of the entity + */ + OWNER, + /** + * Domains of the entity + */ + DOMAIN, + /** + * Groups of which the entity is a member (only applies to corpUser) + */ + GROUP_MEMBERSHIP, + /** + * Data platform instance of resource + */ + DATA_PLATFORM_INSTANCE +} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpec.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpec.java new file mode 100644 index 00000000000000..656bec0f44fc22 --- /dev/null +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpec.java @@ -0,0 +1,23 @@ +package com.datahub.authorization; + +import javax.annotation.Nonnull; +import lombok.Value; + + +/** + * Details about the entities involved in the authorization process. It models the actor and the resource being acted + * upon.
Resource types currently supported can be found inside of {@link com.linkedin.metadata.authorization.PoliciesConfig} + */ +@Value +public class EntitySpec { + /** + * The entity type. (dataset, chart, dashboard, corpGroup, etc). + */ + @Nonnull + String type; + /** + * The entity identity. Most often, this corresponds to the raw entity urn. (urn:li:corpGroup:groupId) + */ + @Nonnull + String entity; +} \ No newline at end of file diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpecResolver.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpecResolver.java new file mode 100644 index 00000000000000..67347fbf87a876 --- /dev/null +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/EntitySpecResolver.java @@ -0,0 +1,11 @@ +package com.datahub.authorization; + +/** + * An Entity Spec Resolver is responsible for resolving a {@link EntitySpec} to a {@link ResolvedEntitySpec}. + */ +public interface EntitySpecResolver { + /** + Resolve a {@link EntitySpec} to a resolved entity spec. + **/ + ResolvedEntitySpec resolve(EntitySpec entitySpec); +} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/FieldResolver.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/FieldResolver.java index 9318f5f8e7b96b..955a06fd54cb95 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/FieldResolver.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/FieldResolver.java @@ -33,9 +33,9 @@ public static FieldResolver getResolverFromValues(Set values) { /** * Helper function that returns FieldResolver given a fetchFieldValue function */ - public static FieldResolver getResolverFromFunction(ResourceSpec resourceSpec, - Function fetchFieldValue) { - return new FieldResolver(() -> CompletableFuture.supplyAsync(() -> fetchFieldValue.apply(resourceSpec))); + public static FieldResolver getResolverFromFunction(EntitySpec entitySpec, + Function fetchFieldValue) { + return new FieldResolver(() -> CompletableFuture.supplyAsync(() -> fetchFieldValue.apply(entitySpec))); } public static FieldValue emptyFieldValue() { diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedEntitySpec.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedEntitySpec.java new file mode 100644 index 00000000000000..7948766df57159 --- /dev/null +++ b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedEntitySpec.java @@ -0,0 +1,66 @@ +package com.datahub.authorization; + +import java.util.Collections; +import java.util.Map; +import java.util.Set; +import javax.annotation.Nullable; +import lombok.Getter; +import lombok.RequiredArgsConstructor; +import lombok.ToString; + + +/** + * Wrapper around authorization request with field resolvers for lazily fetching the field values for each field type + */ +@RequiredArgsConstructor +@ToString +public class ResolvedEntitySpec { + @Getter + private final EntitySpec spec; + private final Map fieldResolvers; + + public Set getFieldValues(EntityFieldType entityFieldType) { + if (!fieldResolvers.containsKey(entityFieldType)) { + return Collections.emptySet(); + } + return fieldResolvers.get(entityFieldType).getFieldValuesFuture().join().getValues(); + } + + /** + * Fetch the owners for an entity. + * @return a set of owner urns, or empty set if none exist. 
+ */ + public Set getOwners() { + if (!fieldResolvers.containsKey(EntityFieldType.OWNER)) { + return Collections.emptySet(); + } + return fieldResolvers.get(EntityFieldType.OWNER).getFieldValuesFuture().join().getValues(); + } + + /** + * Fetch the platform instance for a resolved entity spec. + * @return a Platform Instance or null if one does not exist. + */ + @Nullable + public String getDataPlatformInstance() { + if (!fieldResolvers.containsKey(EntityFieldType.DATA_PLATFORM_INSTANCE)) { + return null; + } + Set dataPlatformInstance = fieldResolvers.get(EntityFieldType.DATA_PLATFORM_INSTANCE).getFieldValuesFuture().join().getValues(); + if (dataPlatformInstance.size() > 0) { + return dataPlatformInstance.stream().findFirst().get(); + } + return null; + } + + /** + * Fetch the group membership for an entity. + * @return a set of group urns, or empty set if none exist. + */ + public Set getGroupMembership() { + if (!fieldResolvers.containsKey(EntityFieldType.GROUP_MEMBERSHIP)) { + return Collections.emptySet(); + } + return fieldResolvers.get(EntityFieldType.GROUP_MEMBERSHIP).getFieldValuesFuture().join().getValues(); + } +} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java deleted file mode 100644 index 53dd0be44f963d..00000000000000 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResolvedResourceSpec.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.datahub.authorization; - -import java.util.Collections; -import java.util.Map; -import java.util.Set; -import lombok.Getter; -import lombok.RequiredArgsConstructor; -import lombok.ToString; - - -/** - * Wrapper around authorization request with field resolvers for lazily fetching the field values for each field type - */ -@RequiredArgsConstructor -@ToString -public class ResolvedResourceSpec { - @Getter - private final ResourceSpec spec; - private final Map fieldResolvers; - - public Set getFieldValues(ResourceFieldType resourceFieldType) { - if (!fieldResolvers.containsKey(resourceFieldType)) { - return Collections.emptySet(); - } - return fieldResolvers.get(resourceFieldType).getFieldValuesFuture().join().getValues(); - } - - /** - * Fetch the owners for a resource. - * @return a set of owner urns, or empty set if none exist. - */ - public Set getOwners() { - if (!fieldResolvers.containsKey(ResourceFieldType.OWNER)) { - return Collections.emptySet(); - } - return fieldResolvers.get(ResourceFieldType.OWNER).getFieldValuesFuture().join().getValues(); - } -} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceFieldType.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceFieldType.java deleted file mode 100644 index ee54d2bfbba1da..00000000000000 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceFieldType.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.datahub.authorization; - -/** - * List of resource field types to fetch for a given resource - */ -public enum ResourceFieldType { - /** - * Type of resource (e.g.
dataset, chart) - */ - RESOURCE_TYPE, - /** - * Urn of resource - */ - RESOURCE_URN, - /** - * Owners of resource - */ - OWNER, - /** - * Domains of resource - */ - DOMAIN -} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpec.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpec.java deleted file mode 100644 index c1bd53e31fe292..00000000000000 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpec.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.datahub.authorization; - -import javax.annotation.Nonnull; -import lombok.Value; - - -/** - * Details about a specific resource being acted upon. Resource types currently supported - * can be found inside of {@link com.linkedin.metadata.authorization.PoliciesConfig} - */ -@Value -public class ResourceSpec { - /** - * The resource type. Most often, this corresponds to the entity type. (dataset, chart, dashboard, corpGroup, etc). - */ - @Nonnull - String type; - /** - * The resource identity. Most often, this corresponds to the raw entity urn. (urn:li:corpGroup:groupId) - */ - @Nonnull - String resource; -} \ No newline at end of file diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpecResolver.java b/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpecResolver.java deleted file mode 100644 index 05c35f377b9a90..00000000000000 --- a/metadata-auth/auth-api/src/main/java/com/datahub/authorization/ResourceSpecResolver.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.datahub.authorization; - -/** - * A Resource Spec Resolver is responsible for resolving a {@link ResourceSpec} to a {@link ResolvedResourceSpec}. - */ -public interface ResourceSpecResolver { - /** - Resolve a {@link ResourceSpec} to a resolved resource spec. 
- **/ - ResolvedResourceSpec resolve(ResourceSpec resourceSpec); -} diff --git a/metadata-auth/auth-api/src/main/java/com/datahub/plugins/auth/authorization/Authorizer.java b/metadata-auth/auth-api/src/main/java/com/datahub/plugins/auth/authorization/Authorizer.java index ce7a3f22b31471..c731a3ec987c1a 100644 --- a/metadata-auth/auth-api/src/main/java/com/datahub/plugins/auth/authorization/Authorizer.java +++ b/metadata-auth/auth-api/src/main/java/com/datahub/plugins/auth/authorization/Authorizer.java @@ -4,7 +4,7 @@ import com.datahub.authorization.AuthorizationResult; import com.datahub.authorization.AuthorizedActors; import com.datahub.authorization.AuthorizerContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.Plugin; import java.util.Map; import java.util.Optional; @@ -32,5 +32,5 @@ public interface Authorizer extends Plugin { * Retrieves the current list of actors authorized to for a particular privilege against * an optional resource */ - AuthorizedActors authorizedActors(final String privilege, final Optional resourceSpec); + AuthorizedActors authorizedActors(final String privilege, final Optional resourceSpec); } diff --git a/metadata-dao-impl/kafka-producer/build.gradle b/metadata-dao-impl/kafka-producer/build.gradle index 393b10b0e9d246..bc3415b2ccc8c1 100644 --- a/metadata-dao-impl/kafka-producer/build.gradle +++ b/metadata-dao-impl/kafka-producer/build.gradle @@ -1,9 +1,9 @@ apply plugin: 'java' dependencies { - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-events:mxe-utils-avro') implementation project(':entity-registry') implementation project(':metadata-io') diff --git a/metadata-events/mxe-avro-1.7/.gitignore b/metadata-events/mxe-avro/.gitignore similarity index 100% rename from metadata-events/mxe-avro-1.7/.gitignore rename to metadata-events/mxe-avro/.gitignore diff --git a/metadata-events/mxe-avro-1.7/build.gradle b/metadata-events/mxe-avro/build.gradle similarity index 81% rename from metadata-events/mxe-avro-1.7/build.gradle rename to metadata-events/mxe-avro/build.gradle index 8c0a26d22dc7d2..9d11eeb160ff0f 100644 --- a/metadata-events/mxe-avro-1.7/build.gradle +++ b/metadata-events/mxe-avro/build.gradle @@ -6,8 +6,8 @@ apply plugin: 'io.acryl.gradle.plugin.avro' apply plugin: 'java-library' dependencies { - api externalDependency.avro_1_7 - implementation(externalDependency.avroCompiler_1_7) { + api externalDependency.avro + implementation(externalDependency.avroCompiler) { exclude group: 'org.apache.velocity', module: 'velocity' } constraints { @@ -21,7 +21,7 @@ dependencies { def genDir = file("src/generated/java") -task avroCodeGen(type: com.commercehub.gradle.plugin.avro.GenerateAvroJavaTask, dependsOn: configurations.avsc) { +task avroCodeGen(type: com.github.davidmc24.gradle.plugin.avro.GenerateAvroJavaTask, dependsOn: configurations.avsc) { source("$rootDir/metadata-events/mxe-schemas/src/renamed/avro") outputDir = genDir dependsOn(':metadata-events:mxe-schemas:renameNamespace') diff --git a/metadata-events/mxe-registration/build.gradle b/metadata-events/mxe-registration/build.gradle index 60e0da59616d93..032870d93329ff 100644 --- a/metadata-events/mxe-registration/build.gradle +++ b/metadata-events/mxe-registration/build.gradle @@ -5,7 +5,7 @@ configurations { } 
dependencies { - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-models') implementation spec.product.pegasus.dataAvro1_6 diff --git a/metadata-events/mxe-schemas/build.gradle b/metadata-events/mxe-schemas/build.gradle index fe46601fb68b79..8dc8b71bd1cd83 100644 --- a/metadata-events/mxe-schemas/build.gradle +++ b/metadata-events/mxe-schemas/build.gradle @@ -1,4 +1,4 @@ -apply plugin: 'java' +apply plugin: 'java-library' apply plugin: 'pegasus' dependencies { diff --git a/metadata-events/mxe-utils-avro-1.7/.gitignore b/metadata-events/mxe-utils-avro/.gitignore similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/.gitignore rename to metadata-events/mxe-utils-avro/.gitignore diff --git a/metadata-events/mxe-utils-avro-1.7/build.gradle b/metadata-events/mxe-utils-avro/build.gradle similarity index 95% rename from metadata-events/mxe-utils-avro-1.7/build.gradle rename to metadata-events/mxe-utils-avro/build.gradle index 3b137965d6c19f..a7bf287ab224d3 100644 --- a/metadata-events/mxe-utils-avro-1.7/build.gradle +++ b/metadata-events/mxe-utils-avro/build.gradle @@ -1,7 +1,7 @@ apply plugin: 'java-library' dependencies { - api project(':metadata-events:mxe-avro-1.7') + api project(':metadata-events:mxe-avro') api project(':metadata-models') api spec.product.pegasus.dataAvro1_6 diff --git a/metadata-events/mxe-utils-avro-1.7/src/main/java/com/linkedin/metadata/EventUtils.java b/metadata-events/mxe-utils-avro/src/main/java/com/linkedin/metadata/EventUtils.java similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/main/java/com/linkedin/metadata/EventUtils.java rename to metadata-events/mxe-utils-avro/src/main/java/com/linkedin/metadata/EventUtils.java diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/java/com/linkedin/metadata/EventUtilsTests.java b/metadata-events/mxe-utils-avro/src/test/java/com/linkedin/metadata/EventUtilsTests.java similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/java/com/linkedin/metadata/EventUtilsTests.java rename to metadata-events/mxe-utils-avro/src/test/java/com/linkedin/metadata/EventUtilsTests.java diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-avro2pegasus-mae.json b/metadata-events/mxe-utils-avro/src/test/resources/test-avro2pegasus-mae.json similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/resources/test-avro2pegasus-mae.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-avro2pegasus-mae.json diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-avro2pegasus-mce.json b/metadata-events/mxe-utils-avro/src/test/resources/test-avro2pegasus-mce.json similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/resources/test-avro2pegasus-mce.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-avro2pegasus-mce.json diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-fmce.json b/metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-fmce.json similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-fmce.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-fmce.json diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-mae.json b/metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-mae.json similarity index 100% rename 
from metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-mae.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-mae.json diff --git a/metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-mce.json b/metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-mce.json similarity index 100% rename from metadata-events/mxe-utils-avro-1.7/src/test/resources/test-pegasus2avro-mce.json rename to metadata-events/mxe-utils-avro/src/test/resources/test-pegasus2avro-mce.json diff --git a/metadata-ingestion-modules/airflow-plugin/build.gradle b/metadata-ingestion-modules/airflow-plugin/build.gradle index 58a2bc9e670e34..dacf12dc020df4 100644 --- a/metadata-ingestion-modules/airflow-plugin/build.gradle +++ b/metadata-ingestion-modules/airflow-plugin/build.gradle @@ -10,6 +10,13 @@ ext { if (!project.hasProperty("extra_pip_requirements")) { ext.extra_pip_requirements = "" } +if (!project.hasProperty("extra_pip_extras")) { + ext.extra_pip_extras = "plugin-v2" +} +// If extra_pip_extras is non-empty, we need to add a comma to the beginning of the string. +if (extra_pip_extras != "") { + ext.extra_pip_extras = "," + extra_pip_extras +} def pip_install_command = "${venv_name}/bin/pip install -e ../../metadata-ingestion" @@ -36,7 +43,7 @@ task installPackage(type: Exec, dependsOn: [environmentSetup, ':metadata-ingesti // and https://github.com/datahub-project/datahub/pull/8435. commandLine 'bash', '-x', '-c', "${pip_install_command} install 'Cython<3.0' 'PyYAML<6' --no-build-isolation && " + - "${pip_install_command} -e . ${extra_pip_requirements} &&" + + "${pip_install_command} -e .[ignore${extra_pip_extras}] ${extra_pip_requirements} &&" + "touch ${sentinel_file}" } @@ -47,7 +54,7 @@ task installDev(type: Exec, dependsOn: [install]) { inputs.file file('setup.py') outputs.file("${sentinel_file}") commandLine 'bash', '-x', '-c', - "${pip_install_command} -e .[dev] ${extra_pip_requirements} && " + + "${pip_install_command} -e .[dev${extra_pip_extras}] ${extra_pip_requirements} && " + "touch ${sentinel_file}" } @@ -79,7 +86,8 @@ task installDevTest(type: Exec, dependsOn: [installDev]) { outputs.dir("${venv_name}") outputs.file("${sentinel_file}") commandLine 'bash', '-x', '-c', - "${pip_install_command} -e .[dev,integration-tests] && touch ${sentinel_file}" + "${pip_install_command} -e .[dev,integration-tests${extra_pip_extras}] ${extra_pip_requirements} && " + + "touch ${sentinel_file}" } def testFile = hasProperty('testFile') ? testFile : 'unknown' @@ -97,20 +105,13 @@ task testSingle(dependsOn: [installDevTest]) { } task testQuick(type: Exec, dependsOn: installDevTest) { - // We can't enforce the coverage requirements if we run a subset of the tests. 
inputs.files(project.fileTree(dir: "src/", include: "**/*.py")) inputs.files(project.fileTree(dir: "tests/")) - outputs.dir("${venv_name}") commandLine 'bash', '-x', '-c', - "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" + "source ${venv_name}/bin/activate && pytest -vv --continue-on-collection-errors --junit-xml=junit.quick.xml" } -task testFull(type: Exec, dependsOn: [testQuick, installDevTest]) { - commandLine 'bash', '-x', '-c', - "source ${venv_name}/bin/activate && pytest -m 'not slow_integration' -vv --continue-on-collection-errors --junit-xml=junit.full.xml" -} - task cleanPythonCache(type: Exec) { commandLine 'bash', '-c', "find src -type f -name '*.py[co]' -delete -o -type d -name __pycache__ -delete -o -type d -empty -delete" diff --git a/metadata-ingestion-modules/airflow-plugin/pyproject.toml b/metadata-ingestion-modules/airflow-plugin/pyproject.toml index fba81486b9f677..648040c1951db8 100644 --- a/metadata-ingestion-modules/airflow-plugin/pyproject.toml +++ b/metadata-ingestion-modules/airflow-plugin/pyproject.toml @@ -12,6 +12,7 @@ include = '\.pyi?$' [tool.isort] indent = ' ' +known_future_library = ['__future__', 'datahub.utilities._markupsafe_compat', 'datahub_provider._airflow_compat'] profile = 'black' sections = 'FUTURE,STDLIB,THIRDPARTY,FIRSTPARTY,LOCALFOLDER' diff --git a/metadata-ingestion-modules/airflow-plugin/setup.cfg b/metadata-ingestion-modules/airflow-plugin/setup.cfg index 157bcce1c298d2..c25256c5751b8d 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.cfg +++ b/metadata-ingestion-modules/airflow-plugin/setup.cfg @@ -41,29 +41,29 @@ ignore_missing_imports = no [tool:pytest] asyncio_mode = auto -addopts = --cov=src --cov-report term-missing --cov-config setup.cfg --strict-markers +addopts = --cov=src --cov-report='' --cov-config setup.cfg --strict-markers -s -v +markers = + integration: marks tests to only run in integration (deselect with '-m "not integration"') testpaths = tests/unit tests/integration -[coverage:run] -# Because of some quirks in the way setup.cfg, coverage.py, pytest-cov, -# and tox interact, we should not uncomment the following line. -# See https://pytest-cov.readthedocs.io/en/latest/config.html and -# https://coverage.readthedocs.io/en/coverage-5.0/config.html. -# We also have some additional pytest/cov config options in tox.ini. -# source = src +# [coverage:run] +# # Because of some quirks in the way setup.cfg, coverage.py, pytest-cov, +# # and tox interact, we should not uncomment the following line. +# # See https://pytest-cov.readthedocs.io/en/latest/config.html and +# # https://coverage.readthedocs.io/en/coverage-5.0/config.html. +# # We also have some additional pytest/cov config options in tox.ini. +# # source = src -[coverage:paths] -# This is necessary for tox-based coverage to be counted properly. -source = - src - */site-packages +# [coverage:paths] +# # This is necessary for tox-based coverage to be counted properly. +# source = +# src +# */site-packages [coverage:report] -# The fail_under value ensures that at least some coverage data is collected. -# We override its value in the tox config. 
show_missing = true exclude_lines = pragma: no cover diff --git a/metadata-ingestion-modules/airflow-plugin/setup.py b/metadata-ingestion-modules/airflow-plugin/setup.py index 47069f59c314d9..a5af881022d8c9 100644 --- a/metadata-ingestion-modules/airflow-plugin/setup.py +++ b/metadata-ingestion-modules/airflow-plugin/setup.py @@ -1,5 +1,6 @@ import os import pathlib +from typing import Dict, Set import setuptools @@ -13,23 +14,43 @@ def get_long_description(): return pathlib.Path(os.path.join(root, "README.md")).read_text() +_version = package_metadata["__version__"] +_self_pin = f"=={_version}" if not _version.endswith("dev0") else "" + + rest_common = {"requests", "requests_file"} base_requirements = { # Compatibility. "dataclasses>=0.6; python_version < '3.7'", - # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to Airflow 2.0.2 dependency conflict - "typing_extensions>=3.7.4.3 ; python_version < '3.8'", - "typing_extensions>=3.10.0.2,<4.6.0 ; python_version >= '3.8'", "mypy_extensions>=0.4.3", # Actual dependencies. - "typing-inspect", "pydantic>=1.5.1", "apache-airflow >= 2.0.2", *rest_common, - f"acryl-datahub == {package_metadata['__version__']}", } +plugins: Dict[str, Set[str]] = { + "datahub-rest": { + f"acryl-datahub[datahub-rest]{_self_pin}", + }, + "datahub-kafka": { + f"acryl-datahub[datahub-kafka]{_self_pin}", + }, + "datahub-file": { + f"acryl-datahub[sync-file-emitter]{_self_pin}", + }, + "plugin-v1": set(), + "plugin-v2": { + # The v2 plugin requires Python 3.8+. + f"acryl-datahub[sql-parser]{_self_pin}", + "openlineage-airflow==1.2.0; python_version >= '3.8'", + }, +} + +# Include datahub-rest in the base requirements. +base_requirements.update(plugins["datahub-rest"]) + mypy_stubs = { "types-dataclasses", @@ -45,11 +66,9 @@ def get_long_description(): # versions 0.1.13 and 0.1.14 seem to have issues "types-click==0.1.12", "types-tabulate", - # avrogen package requires this - "types-pytz", } -base_dev_requirements = { +dev_requirements = { *base_requirements, *mypy_stubs, "black==22.12.0", @@ -66,6 +85,7 @@ def get_long_description(): "pytest-cov>=2.8.1", "tox", "deepdiff", + "tenacity", "requests-mock", "freezegun", "jsonpickle", @@ -74,8 +94,24 @@ def get_long_description(): "packaging", } -dev_requirements = { - *base_dev_requirements, +integration_test_requirements = { + *dev_requirements, + *plugins["datahub-file"], + *plugins["datahub-kafka"], + f"acryl-datahub[testing-utils]{_self_pin}", + # Extra requirements for loading our test dags. + "apache-airflow[snowflake]>=2.0.2", + # https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350 + # Eventually we want to set this to "snowflake-sqlalchemy>=1.4.3". + # However, that doesn't work with older versions of Airflow. Instead + # of splitting this into integration-test-old and integration-test-new, + # adding a bound to SQLAlchemy was the simplest solution. + "sqlalchemy<1.4.42", + # To avoid https://github.com/snowflakedb/snowflake-connector-python/issues/1188, + # we need https://github.com/snowflakedb/snowflake-connector-python/pull/1193 + "snowflake-connector-python>=2.7.10", + "virtualenv", # needed by PythonVirtualenvOperator + "apache-airflow-providers-sqlite", } @@ -88,7 +124,7 @@ def get_long_description(): setuptools.setup( # Package metadata. 
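# Illustrative sketch (not part of the patch) of the _self_pin rule defined
# near the top of setup.py above: release builds pin acryl-datahub to the
# exact plugin version, while dev builds leave it unpinned so the plugin can
# install against a local checkout. The version strings are hypothetical.
def self_pin(version: str) -> str:
    return f"=={version}" if not version.endswith("dev0") else ""

assert self_pin("0.12.0") == "==0.12.0"
assert self_pin("0.12.0.1dev0") == ""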
         name=package_metadata["__package_name__"],
-        version=package_metadata["__version__"],
+        version=_version,
         url="https://datahubproject.io/",
         project_urls={
             "Documentation": "https://datahubproject.io/docs/",
@@ -131,17 +167,8 @@ def get_long_description():
     # Dependencies.
     install_requires=list(base_requirements),
     extras_require={
+        **{plugin: list(dependencies) for plugin, dependencies in plugins.items()},
         "dev": list(dev_requirements),
-        "datahub-kafka": [
-            f"acryl-datahub[datahub-kafka] == {package_metadata['__version__']}"
-        ],
-        "integration-tests": [
-            f"acryl-datahub[datahub-kafka] == {package_metadata['__version__']}",
-            # Extra requirements for Airflow.
-            "apache-airflow[snowflake]>=2.0.2",  # snowflake is used in example dags
-            # Because of https://github.com/snowflakedb/snowflake-sqlalchemy/issues/350 we need to restrict SQLAlchemy's max version.
-            "SQLAlchemy<1.4.42",
-            "virtualenv",  # needed by PythonVirtualenvOperator
-        ],
+        "integration-tests": list(integration_test_requirements),
     },
 )
diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py
index 5ad20e1f72551c..10f014fbd586f5 100644
--- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py
+++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_airflow_shims.py
@@ -1,3 +1,7 @@
+from typing import List
+
+import airflow.version
+import packaging.version
 from airflow.models.baseoperator import BaseOperator
 from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED
@@ -21,7 +25,35 @@
 assert AIRFLOW_PATCHED
+# Approach suggested by https://stackoverflow.com/a/11887885/5004662.
+AIRFLOW_VERSION = packaging.version.parse(airflow.version.version)
+HAS_AIRFLOW_STANDALONE_CMD = AIRFLOW_VERSION >= packaging.version.parse("2.2.0.dev0")
+HAS_AIRFLOW_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.3.0.dev0")
+HAS_AIRFLOW_DAG_LISTENER_API = AIRFLOW_VERSION >= packaging.version.parse("2.5.0.dev0")
+
+
+def get_task_inlets(operator: "Operator") -> List:
+    # From Airflow 2.4, _inlets is dropped and inlets is used consistently.
+    # On earlier versions we have to fall back to _inlets.
+    if hasattr(operator, "_inlets"):
+        return operator._inlets  # type: ignore[attr-defined, union-attr]
+    if hasattr(operator, "get_inlet_defs"):
+        return operator.get_inlet_defs()  # type: ignore[attr-defined]
+    return operator.inlets
+
+
+def get_task_outlets(operator: "Operator") -> List:
+    # From Airflow 2.4, _outlets is dropped and outlets is used consistently.
+    # On earlier versions we have to fall back to _outlets, because outlets
+    # is empty in Airflow < 2.4.0.
+    if hasattr(operator, "_outlets"):
+        return operator._outlets  # type: ignore[attr-defined, union-attr]
+    if hasattr(operator, "get_outlet_defs"):
+        return operator.get_outlet_defs()
+    return operator.outlets
+
+
 __all__ = [
+    "AIRFLOW_VERSION",
+    "HAS_AIRFLOW_LISTENER_API",
     "Operator",
     "MappedOperator",
     "EmptyOperator",
diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py
new file mode 100644
index 00000000000000..67843da2ba995d
--- /dev/null
+++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_config.py
@@ -0,0 +1,80 @@
+from typing import TYPE_CHECKING, Optional
+
+import datahub.emitter.mce_builder as builder
+from airflow.configuration import conf
+from datahub.configuration.common import ConfigModel
+
+if TYPE_CHECKING:
+    from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook
+
+
+class DatahubLineageConfig(ConfigModel):
+    # This class is shared between the lineage backend and the Airflow plugin.
+    # The defaults listed here are only relevant for the lineage backend.
+    # The Airflow plugin's default values come from the fallback values in
+    # the get_lineage_config() function below.
+
+    enabled: bool = True
+
+    # DataHub hook connection ID.
+    datahub_conn_id: str
+
+    # Cluster to associate with the pipelines and tasks. Defaults to "prod".
+    cluster: str = builder.DEFAULT_FLOW_CLUSTER
+
+    # If true, the owners field of the DAG will be captured as a DataHub corpuser.
+    capture_ownership_info: bool = True
+
+    # If true, the tags field of the DAG will be captured as DataHub tags.
+    capture_tags_info: bool = True
+
+    capture_executions: bool = False
+
+    enable_extractors: bool = True
+
+    log_level: Optional[str] = None
+    debug_emitter: bool = False
+
+    disable_openlineage_plugin: bool = True
+
+    # Note that this field is only respected by the lineage backend.
+    # The Airflow plugin behaves as if it were set to True.
+    graceful_exceptions: bool = True
+
+    def make_emitter_hook(self) -> "DatahubGenericHook":
+        # This is necessary to avoid issues with circular imports.
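# A hedged usage sketch (not part of the patch), assuming the _config module
# above is importable: only datahub_conn_id is required, and the class-level
# defaults apply to the lineage backend, while the plugin's defaults come
# from the get_lineage_config() fallbacks below.
from datahub_airflow_plugin._config import DatahubLineageConfig

config = DatahubLineageConfig(datahub_conn_id="datahub_rest_default")
assert config.cluster == "prod"  # builder.DEFAULT_FLOW_CLUSTER, per the comment above
assert config.capture_executions is False  # backend default; the plugin's fallback is True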
+ from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook + + return DatahubGenericHook(self.datahub_conn_id) + + +def get_lineage_config() -> DatahubLineageConfig: + """Load the DataHub plugin config from airflow.cfg.""" + + enabled = conf.get("datahub", "enabled", fallback=True) + datahub_conn_id = conf.get("datahub", "conn_id", fallback="datahub_rest_default") + cluster = conf.get("datahub", "cluster", fallback=builder.DEFAULT_FLOW_CLUSTER) + capture_tags_info = conf.get("datahub", "capture_tags_info", fallback=True) + capture_ownership_info = conf.get( + "datahub", "capture_ownership_info", fallback=True + ) + capture_executions = conf.get("datahub", "capture_executions", fallback=True) + enable_extractors = conf.get("datahub", "enable_extractors", fallback=True) + log_level = conf.get("datahub", "log_level", fallback=None) + debug_emitter = conf.get("datahub", "debug_emitter", fallback=False) + disable_openlineage_plugin = conf.get( + "datahub", "disable_openlineage_plugin", fallback=True + ) + + return DatahubLineageConfig( + enabled=enabled, + datahub_conn_id=datahub_conn_id, + cluster=cluster, + capture_ownership_info=capture_ownership_info, + capture_tags_info=capture_tags_info, + capture_executions=capture_executions, + enable_extractors=enable_extractors, + log_level=log_level, + debug_emitter=debug_emitter, + disable_openlineage_plugin=disable_openlineage_plugin, + ) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py new file mode 100644 index 00000000000000..f39d37b1222285 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_listener_module.py @@ -0,0 +1,7 @@ +from datahub_airflow_plugin.datahub_listener import get_airflow_plugin_listener + +_listener = get_airflow_plugin_listener() +if _listener: + on_task_instance_running = _listener.on_task_instance_running + on_task_instance_success = _listener.on_task_instance_success + on_task_instance_failed = _listener.on_task_instance_failed diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py new file mode 100644 index 00000000000000..7d35791bf1db42 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_datahub_ol_adapter.py @@ -0,0 +1,23 @@ +import logging + +import datahub.emitter.mce_builder as builder +from openlineage.client.run import Dataset as OpenLineageDataset + +logger = logging.getLogger(__name__) + + +OL_SCHEME_TWEAKS = { + "sqlserver": "mssql", + "trino": "presto", + "awsathena": "athena", +} + + +def translate_ol_to_datahub_urn(ol_uri: OpenLineageDataset) -> str: + namespace = ol_uri.namespace + name = ol_uri.name + + scheme, *rest = namespace.split("://", maxsplit=1) + + platform = OL_SCHEME_TWEAKS.get(scheme, scheme) + return builder.make_dataset_urn(platform=platform, name=name) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py new file mode 100644 index 00000000000000..f84b7b56f61191 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_extractors.py @@ -0,0 +1,244 @@ +import contextlib +import logging +import unittest.mock +from typing import TYPE_CHECKING, 
Optional
+
+import datahub.emitter.mce_builder as builder
+from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import (
+    get_platform_from_sqlalchemy_uri,
+)
+from datahub.utilities.sqlglot_lineage import (
+    SqlParsingResult,
+    create_lineage_sql_parsed_result,
+)
+from openlineage.airflow.extractors import BaseExtractor
+from openlineage.airflow.extractors import ExtractorManager as OLExtractorManager
+from openlineage.airflow.extractors import TaskMetadata
+from openlineage.airflow.extractors.snowflake_extractor import SnowflakeExtractor
+from openlineage.airflow.extractors.sql_extractor import SqlExtractor
+from openlineage.airflow.utils import get_operator_class, try_import_from_string
+from openlineage.client.facet import (
+    ExtractionError,
+    ExtractionErrorRunFacet,
+    SqlJobFacet,
+)
+
+from datahub_airflow_plugin._airflow_shims import Operator
+from datahub_airflow_plugin._datahub_ol_adapter import OL_SCHEME_TWEAKS
+
+if TYPE_CHECKING:
+    from airflow.models import DagRun, TaskInstance
+    from datahub.ingestion.graph.client import DataHubGraph
+
+logger = logging.getLogger(__name__)
+_DATAHUB_GRAPH_CONTEXT_KEY = "datahub_graph"
+SQL_PARSING_RESULT_KEY = "datahub_sql"
+
+
+class ExtractorManager(OLExtractorManager):
+    # TODO: On Airflow 2.7, the OLExtractorManager is part of the built-in Airflow API.
+    # When available, we should use that instead. The same goes for most of the OL
+    # extractors.
+
+    def __init__(self):
+        super().__init__()
+
+        _sql_operator_overrides = [
+            # The OL BigQuery extractor has some complex logic to detect the
+            # BigQuery job_id and fetch lineage from there. However, it can't
+            # generate CLL, so we disable it and use our own extractor instead.
+            "BigQueryOperator",
+            "BigQueryExecuteQueryOperator",
+            # Athena also does something similar.
+            "AthenaOperator",
+            "AWSAthenaOperator",
+            # Additional types that OL doesn't support. This is only necessary because
+            # on older versions of Airflow, these operators don't inherit from SQLExecuteQueryOperator.
+            "SqliteOperator",
+        ]
+        for operator in _sql_operator_overrides:
+            self.task_to_extractor.extractors[operator] = GenericSqlExtractor
+
+        self._graph: Optional["DataHubGraph"] = None
+
+    @contextlib.contextmanager
+    def _patch_extractors(self):
+        with contextlib.ExitStack() as stack:
+            # Patch the SqlExtractor.extract() method.
+            stack.enter_context(
+                unittest.mock.patch.object(
+                    SqlExtractor,
+                    "extract",
+                    _sql_extractor_extract,
+                )
+            )
+
+            # Patch the SnowflakeExtractor.default_schema property.
+            stack.enter_context(
+                unittest.mock.patch.object(
+                    SnowflakeExtractor,
+                    "default_schema",
+                    property(snowflake_default_schema),
+                )
+            )
+
+            # TODO: Override the BigQuery extractor to use the DataHub SQL parser.
+            # self.extractor_manager.add_extractor()
+
+            # TODO: Override the Athena extractor to use the DataHub SQL parser.
+
+            yield
+
+    def extract_metadata(
+        self,
+        dagrun: "DagRun",
+        task: "Operator",
+        complete: bool = False,
+        task_instance: Optional["TaskInstance"] = None,
+        task_uuid: Optional[str] = None,
+        graph: Optional["DataHubGraph"] = None,
+    ) -> TaskMetadata:
+        self._graph = graph
+        with self._patch_extractors():
+            return super().extract_metadata(
+                dagrun, task, complete, task_instance, task_uuid
+            )
+
+    def _get_extractor(self, task: "Operator") -> Optional[BaseExtractor]:
+        # By adding this, we can use the generic extractor as a fallback for
+        # any operator that inherits from SQLExecuteQueryOperator.
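# A self-contained sketch (not part of the patch) of the patching pattern
# _patch_extractors() uses above: contextlib.ExitStack plus
# unittest.mock.patch.object swap a method only for the duration of the
# extract_metadata() call, then restore the original.
import contextlib
import unittest.mock

class FakeExtractor:
    def extract(self) -> str:
        return "original"

def _patched_extract(self) -> str:
    return "patched"

with contextlib.ExitStack() as stack:
    stack.enter_context(
        unittest.mock.patch.object(FakeExtractor, "extract", _patched_extract)
    )
    assert FakeExtractor().extract() == "patched"
assert FakeExtractor().extract() == "original"  # restored on exit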
+ clazz = get_operator_class(task) + SQLExecuteQueryOperator = try_import_from_string( + "airflow.providers.common.sql.operators.sql.SQLExecuteQueryOperator" + ) + if SQLExecuteQueryOperator and issubclass(clazz, SQLExecuteQueryOperator): + self.task_to_extractor.extractors.setdefault( + clazz.__name__, GenericSqlExtractor + ) + + extractor = super()._get_extractor(task) + if extractor: + extractor.set_context(_DATAHUB_GRAPH_CONTEXT_KEY, self._graph) + return extractor + + +class GenericSqlExtractor(SqlExtractor): + # Note that the extract() method is patched elsewhere. + + @property + def default_schema(self): + return super().default_schema + + def _get_scheme(self) -> Optional[str]: + # Best effort conversion to DataHub platform names. + + with contextlib.suppress(Exception): + if self.hook: + if hasattr(self.hook, "get_uri"): + uri = self.hook.get_uri() + return get_platform_from_sqlalchemy_uri(uri) + + return self.conn.conn_type or super().dialect + + def _get_database(self) -> Optional[str]: + if self.conn: + # For BigQuery, the "database" is the project name. + if hasattr(self.conn, "project_id"): + return self.conn.project_id + + return self.conn.schema + return None + + +def _sql_extractor_extract(self: "SqlExtractor") -> TaskMetadata: + # Why not override the OL sql_parse method directly, instead of overriding + # extract()? A few reasons: + # + # 1. We would want to pass the default_db and graph instance into our sql parser + # method. The OL code doesn't pass the default_db (despite having it available), + # and it's not clear how to get the graph instance into that method. + # 2. OL has some janky logic to fetch table schemas as part of the sql extractor. + # We don't want that behavior and this lets us disable it. + # 3. Our SqlParsingResult already has DataHub urns, whereas using SqlMeta would + # require us to convert those urns to OL uris, just for them to get converted + # back to urns later on in our processing. + + task_name = f"{self.operator.dag_id}.{self.operator.task_id}" + sql = self.operator.sql + + run_facets = {} + job_facets = {"sql": SqlJobFacet(query=self._normalize_sql(sql))} + + # Prepare to run the SQL parser. + graph = self.context.get(_DATAHUB_GRAPH_CONTEXT_KEY, None) + + default_database = getattr(self.operator, "database", None) + if not default_database: + default_database = self.database + default_schema = self.default_schema + + # TODO: Add better handling for sql being a list of statements. + if isinstance(sql, list): + logger.info(f"Got list of SQL statements for {task_name}. Using first one.") + sql = sql[0] + + # Run the SQL parser. 
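# A minimal sketch (not part of the patch) of the scheme -> platform step
# that follows: OL_SCHEME_TWEAKS, defined in _datahub_ol_adapter.py earlier
# in this diff, renames a few OpenLineage schemes to their DataHub platform
# names and passes everything else through unchanged.
OL_SCHEME_TWEAKS = {"sqlserver": "mssql", "trino": "presto", "awsathena": "athena"}

assert OL_SCHEME_TWEAKS.get("sqlserver", "sqlserver") == "mssql"
assert OL_SCHEME_TWEAKS.get("postgres", "postgres") == "postgres"  # passthrough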
+    scheme = self.scheme
+    platform = OL_SCHEME_TWEAKS.get(scheme, scheme)
+    self.log.debug(
+        "Running the SQL parser %s (platform=%s, default db=%s, schema=%s): %s",
+        "with graph client" if graph else "in offline mode",
+        platform,
+        default_database,
+        default_schema,
+        sql,
+    )
+    sql_parsing_result: SqlParsingResult = create_lineage_sql_parsed_result(
+        query=sql,
+        graph=graph,
+        platform=platform,
+        platform_instance=None,
+        env=builder.DEFAULT_ENV,
+        database=default_database,
+        schema=default_schema,
+    )
+    self.log.debug(f"Got sql lineage {sql_parsing_result}")
+
+    if sql_parsing_result.debug_info.error:
+        error = sql_parsing_result.debug_info.error
+        run_facets["extractionError"] = ExtractionErrorRunFacet(
+            totalTasks=1,
+            failedTasks=1,
+            errors=[
+                ExtractionError(
+                    errorMessage=str(error),
+                    stackTrace=None,
+                    task="datahub_sql_parser",
+                    taskNumber=None,
+                )
+            ],
+        )
+
+    # Save sql_parsing_result to the facets dict. It is removed from the
+    # facet dict in the extractor's processing logic.
+    run_facets[SQL_PARSING_RESULT_KEY] = sql_parsing_result  # type: ignore
+
+    return TaskMetadata(
+        name=task_name,
+        inputs=[],
+        outputs=[],
+        run_facets=run_facets,
+        job_facets=job_facets,
+    )
+
+
+def snowflake_default_schema(self: "SnowflakeExtractor") -> Optional[str]:
+    if hasattr(self.operator, "schema") and self.operator.schema is not None:
+        return self.operator.schema
+    return (
+        self.conn.extra_dejson.get("extra__snowflake__schema", "")
+        or self.conn.extra_dejson.get("schema", "")
+        or self.conn.schema
+    )
+    # TODO: Should we try a fallback of:
+    # execute_query_on_hook(self.hook, "SELECT current_schema();")[0][0]
diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py
index b5e86e14d85d0f..e1d53be7bae6b9 100644
--- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py
+++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/client/airflow_generator.py
@@ -1,4 +1,5 @@
-from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union, cast
+from datetime import datetime
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple, Union, cast
 from airflow.configuration import conf
 from datahub.api.entities.datajob import DataFlow, DataJob
@@ -6,6 +7,7 @@
     DataProcessInstance,
     InstanceRunResult,
 )
+from datahub.emitter.generic_emitter import Emitter
 from datahub.metadata.schema_classes import DataProcessTypeClass
 from datahub.utilities.urns.data_flow_urn import DataFlowUrn
 from datahub.utilities.urns.data_job_urn import DataJobUrn
@@ -17,8 +19,6 @@
 if TYPE_CHECKING:
     from airflow import DAG
     from airflow.models import DagRun, TaskInstance
-    from datahub.emitter.kafka_emitter import DatahubKafkaEmitter
-    from datahub.emitter.rest_emitter import DatahubRestEmitter
     from datahub_airflow_plugin._airflow_shims import Operator
@@ -91,14 +91,14 @@ def _get_dependencies(
             )
             # if the task triggers the subdag, link it to this node in the subdag
-            if subdag_task_id in _task_downstream_task_ids(upstream_task):
+            if subdag_task_id in sorted(_task_downstream_task_ids(upstream_task)):
                 upstream_subdag_triggers.append(upstream_task_urn)
         # If the operator is an ExternalTaskSensor then we set the remote task as upstream.
         # It is possible to tie an external sensor to a DAG if external_task_id is omitted, but currently we can't tie
        # a jobflow to another jobflow.
external_task_upstreams = [] - if task.task_type == "ExternalTaskSensor": + if isinstance(task, ExternalTaskSensor): task = cast(ExternalTaskSensor, task) if hasattr(task, "external_task_id") and task.external_task_id is not None: external_task_upstreams = [ @@ -143,7 +143,7 @@ def generate_dataflow( """ id = dag.dag_id orchestrator = "airflow" - description = f"{dag.description}\n\n{dag.doc_md or ''}" + description = "\n\n".join(filter(None, [dag.description, dag.doc_md])) or None data_flow = DataFlow( env=cluster, id=id, orchestrator=orchestrator, description=description ) @@ -153,8 +153,10 @@ def generate_dataflow( allowed_flow_keys = [ "_access_control", "_concurrency", - "_default_view", + # "_default_view", "catchup", + "description", + "doc_md", "fileloc", "is_paused_upon_creation", "start_date", @@ -171,7 +173,7 @@ def generate_dataflow( data_flow.url = f"{base_url}/tree?dag_id={dag.dag_id}" if capture_owner and dag.owner: - data_flow.owners.add(dag.owner) + data_flow.owners.update(owner.strip() for owner in dag.owner.split(",")) if capture_tags and dag.tags: data_flow.tags.update(dag.tags) @@ -227,10 +229,7 @@ def generate_datajob( job_property_bag: Dict[str, str] = {} - allowed_task_keys = [ - "_downstream_task_ids", - "_inlets", - "_outlets", + allowed_task_keys: List[Union[str, Tuple[str, ...]]] = [ "_task_type", "_task_module", "depends_on_past", @@ -243,15 +242,28 @@ def generate_datajob( "trigger_rule", "wait_for_downstream", # In Airflow 2.3, _downstream_task_ids was renamed to downstream_task_ids - "downstream_task_ids", + ("downstream_task_ids", "_downstream_task_ids"), # In Airflow 2.4, _inlets and _outlets were removed in favor of non-private versions. - "inlets", - "outlets", + ("inlets", "_inlets"), + ("outlets", "_outlets"), ] for key in allowed_task_keys: - if hasattr(task, key): - job_property_bag[key] = repr(getattr(task, key)) + if isinstance(key, tuple): + out_key: str = key[0] + try_keys = key + else: + out_key = key + try_keys = (key,) + + for k in try_keys: + if hasattr(task, k): + v = getattr(task, k) + if out_key == "downstream_task_ids": + # Generate these in a consistent order. + v = list(sorted(v)) + job_property_bag[out_key] = repr(v) + break datajob.properties = job_property_bag base_url = conf.get("webserver", "base_url") @@ -288,7 +300,7 @@ def create_datajob_instance( @staticmethod def run_dataflow( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, cluster: str, dag_run: "DagRun", start_timestamp_millis: Optional[int] = None, @@ -340,7 +352,7 @@ def run_dataflow( @staticmethod def complete_dataflow( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, cluster: str, dag_run: "DagRun", end_timestamp_millis: Optional[int] = None, @@ -348,7 +360,7 @@ def complete_dataflow( ) -> None: """ - :param emitter: DatahubRestEmitter - the datahub rest emitter to emit the generated mcps + :param emitter: Emitter - the datahub emitter to emit the generated mcps :param cluster: str - name of the cluster :param dag_run: DagRun :param end_timestamp_millis: Optional[int] - the completion time in milliseconds if not set the current time will be used. 
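# A self-contained sketch (not part of the patch) of the allowed_task_keys
# fallback loop in generate_datajob above: each entry is either a plain key
# or a (new_name, old_name) tuple tried in order, so the Airflow 2.3/2.4
# renames land under one stable property name, and downstream_task_ids is
# sorted for deterministic output.
class FakeTask:  # stand-in for an Airflow 2.2-era operator
    _downstream_task_ids = {"b", "a"}

task = FakeTask()
job_property_bag = {}
allowed_task_keys = [("downstream_task_ids", "_downstream_task_ids")]

for key in allowed_task_keys:
    out_key, try_keys = (key[0], key) if isinstance(key, tuple) else (key, (key,))
    for k in try_keys:
        if hasattr(task, k):
            v = getattr(task, k)
            if out_key == "downstream_task_ids":
                v = list(sorted(v))  # consistent ordering
            job_property_bag[out_key] = repr(v)
            break

assert job_property_bag == {"downstream_task_ids": "['a', 'b']"}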
@@ -386,7 +398,7 @@ def complete_dataflow( @staticmethod def run_datajob( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, cluster: str, ti: "TaskInstance", dag: "DAG", @@ -413,17 +425,17 @@ def run_datajob( job_property_bag["end_date"] = str(ti.end_date) job_property_bag["execution_date"] = str(ti.execution_date) job_property_bag["try_number"] = str(ti.try_number - 1) - job_property_bag["hostname"] = str(ti.hostname) job_property_bag["max_tries"] = str(ti.max_tries) # Not compatible with Airflow 1 if hasattr(ti, "external_executor_id"): job_property_bag["external_executor_id"] = str(ti.external_executor_id) - job_property_bag["pid"] = str(ti.pid) job_property_bag["state"] = str(ti.state) job_property_bag["operator"] = str(ti.operator) job_property_bag["priority_weight"] = str(ti.priority_weight) - job_property_bag["unixname"] = str(ti.unixname) job_property_bag["log_url"] = ti.log_url + job_property_bag["orchestrator"] = "airflow" + job_property_bag["dag_id"] = str(dag.dag_id) + job_property_bag["task_id"] = str(ti.task_id) dpi.properties.update(job_property_bag) dpi.url = ti.log_url @@ -442,8 +454,10 @@ def run_datajob( dpi.type = DataProcessTypeClass.BATCH_AD_HOC if start_timestamp_millis is None: - assert ti.start_date - start_timestamp_millis = int(ti.start_date.timestamp() * 1000) + if ti.start_date: + start_timestamp_millis = int(ti.start_date.timestamp() * 1000) + else: + start_timestamp_millis = int(datetime.now().timestamp() * 1000) if attempt is None: attempt = ti.try_number @@ -458,7 +472,7 @@ def run_datajob( @staticmethod def complete_datajob( - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, cluster: str, ti: "TaskInstance", dag: "DAG", @@ -469,7 +483,7 @@ def complete_datajob( ) -> DataProcessInstance: """ - :param emitter: DatahubRestEmitter + :param emitter: Emitter - the datahub emitter to emit the generated mcps :param cluster: str :param ti: TaskInstance :param dag: DAG @@ -483,8 +497,10 @@ def complete_datajob( datajob = AirflowGenerator.generate_datajob(cluster, ti.task, dag) if end_timestamp_millis is None: - assert ti.end_date - end_timestamp_millis = int(ti.end_date.timestamp() * 1000) + if ti.end_date: + end_timestamp_millis = int(ti.end_date.timestamp() * 1000) + else: + end_timestamp_millis = int(datetime.now().timestamp() * 1000) if result is None: # We should use TaskInstanceState but it is not available in Airflow 1 diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py new file mode 100644 index 00000000000000..a3f5cb489e29fb --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_listener.py @@ -0,0 +1,494 @@ +import copy +import functools +import logging +import threading +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, TypeVar, cast + +import airflow +import datahub.emitter.mce_builder as builder +from datahub.api.entities.datajob import DataJob +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult +from datahub.emitter.rest_emitter import DatahubRestEmitter +from datahub.ingestion.graph.client import DataHubGraph +from datahub.metadata.schema_classes import ( + FineGrainedLineageClass, + FineGrainedLineageDownstreamTypeClass, + FineGrainedLineageUpstreamTypeClass, +) +from datahub.telemetry import telemetry +from datahub.utilities.sqlglot_lineage import 
SqlParsingResult +from datahub.utilities.urns.dataset_urn import DatasetUrn +from openlineage.airflow.listener import TaskHolder +from openlineage.airflow.utils import redact_with_exclusions +from openlineage.client.serde import Serde + +from datahub_airflow_plugin._airflow_shims import ( + HAS_AIRFLOW_DAG_LISTENER_API, + Operator, + get_task_inlets, + get_task_outlets, +) +from datahub_airflow_plugin._config import DatahubLineageConfig, get_lineage_config +from datahub_airflow_plugin._datahub_ol_adapter import translate_ol_to_datahub_urn +from datahub_airflow_plugin._extractors import SQL_PARSING_RESULT_KEY, ExtractorManager +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator +from datahub_airflow_plugin.entities import _Entity + +_F = TypeVar("_F", bound=Callable[..., None]) +if TYPE_CHECKING: + from airflow.models import DAG, DagRun, TaskInstance + from sqlalchemy.orm import Session + + # To placate mypy on Airflow versions that don't have the listener API, + # we define a dummy hookimpl that's an identity function. + + def hookimpl(f: _F) -> _F: # type: ignore[misc] # noqa: F811 + return f + +else: + from airflow.listeners import hookimpl + +logger = logging.getLogger(__name__) + +_airflow_listener_initialized = False +_airflow_listener: Optional["DataHubListener"] = None +_RUN_IN_THREAD = True +_RUN_IN_THREAD_TIMEOUT = 30 + + +def get_airflow_plugin_listener() -> Optional["DataHubListener"]: + # Using globals instead of functools.lru_cache to make testing easier. + global _airflow_listener_initialized + global _airflow_listener + + if not _airflow_listener_initialized: + _airflow_listener_initialized = True + + plugin_config = get_lineage_config() + + if plugin_config.enabled: + _airflow_listener = DataHubListener(config=plugin_config) + + if plugin_config.disable_openlineage_plugin: + # Deactivate the OpenLineagePlugin listener to avoid conflicts. + from openlineage.airflow.plugin import OpenLineagePlugin + + OpenLineagePlugin.listeners = [] + + telemetry.telemetry_instance.ping( + "airflow-plugin-init", + { + "airflow-version": airflow.__version__, + "datahub-airflow-plugin": "v2", + "datahub-airflow-plugin-dag-events": HAS_AIRFLOW_DAG_LISTENER_API, + "capture_executions": plugin_config.capture_executions, + "capture_tags": plugin_config.capture_tags_info, + "capture_ownership": plugin_config.capture_ownership_info, + "enable_extractors": plugin_config.enable_extractors, + "disable_openlineage_plugin": plugin_config.disable_openlineage_plugin, + }, + ) + return _airflow_listener + + +def run_in_thread(f: _F) -> _F: + # This is also responsible for catching exceptions and logging them. + + @functools.wraps(f) + def wrapper(*args, **kwargs): + try: + if _RUN_IN_THREAD: + # A poor-man's timeout mechanism. + # This ensures that we don't hang the task if the extractors + # are slow or the DataHub API is slow to respond. + + thread = threading.Thread( + target=f, args=args, kwargs=kwargs, daemon=True + ) + thread.start() + + thread.join(timeout=_RUN_IN_THREAD_TIMEOUT) + if thread.is_alive(): + logger.warning( + f"Thread for {f.__name__} is still running after {_RUN_IN_THREAD_TIMEOUT} seconds. " + "Continuing without waiting for it to finish." 
+ ) + else: + f(*args, **kwargs) + except Exception as e: + logger.exception(e) + + return cast(_F, wrapper) + + +class DataHubListener: + __name__ = "DataHubListener" + + def __init__(self, config: DatahubLineageConfig): + self.config = config + self._set_log_level() + + self._emitter = config.make_emitter_hook().make_emitter() + self._graph: Optional[DataHubGraph] = None + logger.info(f"DataHub plugin using {repr(self._emitter)}") + + # See discussion here https://github.com/OpenLineage/OpenLineage/pull/508 for + # why we need to keep track of tasks ourselves. + self._task_holder = TaskHolder() + + # In our case, we also want to cache the initial datajob object + # so that we can add to it when the task completes. + self._datajob_holder: Dict[str, DataJob] = {} + + self.extractor_manager = ExtractorManager() + + # This "inherits" from types.ModuleType to avoid issues with Airflow's listener plugin loader. + # It previously (v2.4.x and likely other versions too) would throw errors if it was not a module. + # https://github.com/apache/airflow/blob/e99a518970b2d349a75b1647f6b738c8510fa40e/airflow/listeners/listener.py#L56 + # self.__class__ = types.ModuleType + + @property + def emitter(self): + return self._emitter + + @property + def graph(self) -> Optional[DataHubGraph]: + if self._graph: + return self._graph + + if isinstance(self._emitter, DatahubRestEmitter) and not isinstance( + self._emitter, DataHubGraph + ): + # This is lazy initialized to avoid throwing errors on plugin load. + self._graph = self._emitter.to_graph() + self._emitter = self._graph + + return self._graph + + def _set_log_level(self) -> None: + """Set the log level for the plugin and its dependencies. + + This may need to be called multiple times, since Airflow sometimes + messes with the logging configuration after the plugin is loaded. + In particular, the loggers may get changed when the worker starts + executing a task. + """ + + if self.config.log_level: + logging.getLogger(__name__.split(".")[0]).setLevel(self.config.log_level) + if self.config.debug_emitter: + logging.getLogger("datahub.emitter").setLevel(logging.DEBUG) + + def _make_emit_callback(self) -> Callable[[Optional[Exception], str], None]: + def emit_callback(err: Optional[Exception], msg: str) -> None: + if err: + logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) + + return emit_callback + + def _extract_lineage( + self, + datajob: DataJob, + dagrun: "DagRun", + task: "Operator", + task_instance: "TaskInstance", + complete: bool = False, + ) -> None: + """ + Combine lineage (including column lineage) from task inlets/outlets and + extractor-generated task_metadata and write it to the datajob. This + routine is also responsible for converting the lineage to DataHub URNs. + """ + + input_urns: List[str] = [] + output_urns: List[str] = [] + fine_grained_lineages: List[FineGrainedLineageClass] = [] + + task_metadata = None + if self.config.enable_extractors: + task_metadata = self.extractor_manager.extract_metadata( + dagrun, + task, + complete=complete, + task_instance=task_instance, + task_uuid=str(datajob.urn), + graph=self.graph, + ) + logger.debug(f"Got task metadata: {task_metadata}") + + # Translate task_metadata.inputs/outputs to DataHub URNs. + input_urns.extend( + translate_ol_to_datahub_urn(dataset) for dataset in task_metadata.inputs + ) + output_urns.extend( + translate_ol_to_datahub_urn(dataset) + for dataset in task_metadata.outputs + ) + + # Add DataHub-native SQL parser results. 
+ sql_parsing_result: Optional[SqlParsingResult] = None + if task_metadata: + sql_parsing_result = task_metadata.run_facets.pop( + SQL_PARSING_RESULT_KEY, None + ) + if sql_parsing_result: + if sql_parsing_result.debug_info.error: + datajob.properties["datahub_sql_parser_error"] = str( + sql_parsing_result.debug_info.error + ) + if not sql_parsing_result.debug_info.table_error: + input_urns.extend(sql_parsing_result.in_tables) + output_urns.extend(sql_parsing_result.out_tables) + + if sql_parsing_result.column_lineage: + fine_grained_lineages.extend( + FineGrainedLineageClass( + upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET, + downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD, + upstreams=[ + builder.make_schema_field_urn( + upstream.table, upstream.column + ) + for upstream in column_lineage.upstreams + ], + downstreams=[ + builder.make_schema_field_urn( + downstream.table, downstream.column + ) + for downstream in [column_lineage.downstream] + if downstream.table + ], + ) + for column_lineage in sql_parsing_result.column_lineage + ) + + # Add DataHub-native inlets/outlets. + # These are filtered out by the extractor, so we need to add them manually. + input_urns.extend( + iolet.urn for iolet in get_task_inlets(task) if isinstance(iolet, _Entity) + ) + output_urns.extend( + iolet.urn for iolet in get_task_outlets(task) if isinstance(iolet, _Entity) + ) + + # Write the lineage to the datajob object. + datajob.inlets.extend(DatasetUrn.create_from_string(urn) for urn in input_urns) + datajob.outlets.extend( + DatasetUrn.create_from_string(urn) for urn in output_urns + ) + datajob.fine_grained_lineages.extend(fine_grained_lineages) + + # Merge in extra stuff that was present in the DataJob we constructed + # at the start of the task. + if complete: + original_datajob = self._datajob_holder.get(str(datajob.urn), None) + else: + self._datajob_holder[str(datajob.urn)] = datajob + original_datajob = None + + if original_datajob: + logger.debug("Merging start datajob into finish datajob") + datajob.inlets.extend(original_datajob.inlets) + datajob.outlets.extend(original_datajob.outlets) + datajob.fine_grained_lineages.extend(original_datajob.fine_grained_lineages) + + for k, v in original_datajob.properties.items(): + datajob.properties.setdefault(k, v) + + # Deduplicate inlets/outlets. + datajob.inlets = list(sorted(set(datajob.inlets), key=lambda x: str(x))) + datajob.outlets = list(sorted(set(datajob.outlets), key=lambda x: str(x))) + + # Write all other OL facets as DataHub properties. + if task_metadata: + for k, v in task_metadata.job_facets.items(): + datajob.properties[f"openlineage_job_facet_{k}"] = Serde.to_json( + redact_with_exclusions(v) + ) + + for k, v in task_metadata.run_facets.items(): + datajob.properties[f"openlineage_run_facet_{k}"] = Serde.to_json( + redact_with_exclusions(v) + ) + + @hookimpl + @run_in_thread + def on_task_instance_running( + self, + previous_state: None, + task_instance: "TaskInstance", + session: "Session", # This will always be QUEUED + ) -> None: + self._set_log_level() + + # This if statement mirrors the logic in https://github.com/OpenLineage/OpenLineage/pull/508. + if not hasattr(task_instance, "task"): + # The type ignore is to placate mypy on Airflow 2.1.x. 
+ logger.warning( + f"No task set for task_id: {task_instance.task_id} - " # type: ignore[attr-defined] + f"dag_id: {task_instance.dag_id} - run_id {task_instance.run_id}" # type: ignore[attr-defined] + ) + return + + logger.debug( + f"DataHub listener got notification about task instance start for {task_instance.task_id}" + ) + + # Render templates in a copy of the task instance. + # This is necessary to get the correct operator args in the extractors. + task_instance = copy.deepcopy(task_instance) + task_instance.render_templates() + + # The type ignore is to placate mypy on Airflow 2.1.x. + dagrun: "DagRun" = task_instance.dag_run # type: ignore[attr-defined] + task = task_instance.task + dag: "DAG" = task.dag # type: ignore[assignment] + + self._task_holder.set_task(task_instance) + + # Handle async operators in Airflow 2.3 by skipping deferred state. + # Inspired by https://github.com/OpenLineage/OpenLineage/pull/1601 + if task_instance.next_method is not None: # type: ignore[attr-defined] + return + + # If we don't have the DAG listener API, we just pretend that + # the start of the task is the start of the DAG. + # This generates duplicate events, but it's better than not + # generating anything. + if not HAS_AIRFLOW_DAG_LISTENER_API: + self.on_dag_start(dagrun) + + datajob = AirflowGenerator.generate_datajob( + cluster=self.config.cluster, + task=task, + dag=dag, + capture_tags=self.config.capture_tags_info, + capture_owner=self.config.capture_ownership_info, + ) + + # TODO: Make use of get_task_location to extract github urls. + + # Add lineage info. + self._extract_lineage(datajob, dagrun, task, task_instance) + + # TODO: Add handling for Airflow mapped tasks using task_instance.map_index + + datajob.emit(self.emitter, callback=self._make_emit_callback()) + logger.debug(f"Emitted DataHub Datajob start: {datajob}") + + if self.config.capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=self.emitter, + cluster=self.config.cluster, + ti=task_instance, + dag=dag, + dag_run=dagrun, + datajob=datajob, + emit_templates=False, + ) + logger.debug(f"Emitted DataHub DataProcess Instance start: {dpi}") + + self.emitter.flush() + + logger.debug( + f"DataHub listener finished processing notification about task instance start for {task_instance.task_id}" + ) + + def on_task_instance_finish( + self, task_instance: "TaskInstance", status: InstanceRunResult + ) -> None: + dagrun: "DagRun" = task_instance.dag_run # type: ignore[attr-defined] + task = self._task_holder.get_task(task_instance) or task_instance.task + dag: "DAG" = task.dag # type: ignore[assignment] + + datajob = AirflowGenerator.generate_datajob( + cluster=self.config.cluster, + task=task, + dag=dag, + capture_tags=self.config.capture_tags_info, + capture_owner=self.config.capture_ownership_info, + ) + + # Add lineage info. 
+ self._extract_lineage(datajob, dagrun, task, task_instance, complete=True) + + datajob.emit(self.emitter, callback=self._make_emit_callback()) + logger.debug(f"Emitted DataHub Datajob finish w/ status {status}: {datajob}") + + if self.config.capture_executions: + dpi = AirflowGenerator.complete_datajob( + emitter=self.emitter, + cluster=self.config.cluster, + ti=task_instance, + dag=dag, + dag_run=dagrun, + datajob=datajob, + result=status, + ) + logger.debug( + f"Emitted DataHub DataProcess Instance with status {status}: {dpi}" + ) + + self.emitter.flush() + + @hookimpl + @run_in_thread + def on_task_instance_success( + self, previous_state: None, task_instance: "TaskInstance", session: "Session" + ) -> None: + self._set_log_level() + + logger.debug( + f"DataHub listener got notification about task instance success for {task_instance.task_id}" + ) + self.on_task_instance_finish(task_instance, status=InstanceRunResult.SUCCESS) + logger.debug( + f"DataHub listener finished processing task instance success for {task_instance.task_id}" + ) + + @hookimpl + @run_in_thread + def on_task_instance_failed( + self, previous_state: None, task_instance: "TaskInstance", session: "Session" + ) -> None: + self._set_log_level() + + logger.debug( + f"DataHub listener got notification about task instance failure for {task_instance.task_id}" + ) + + # TODO: Handle UP_FOR_RETRY state. + self.on_task_instance_finish(task_instance, status=InstanceRunResult.FAILURE) + logger.debug( + f"DataHub listener finished processing task instance failure for {task_instance.task_id}" + ) + + def on_dag_start(self, dag_run: "DagRun") -> None: + dag = dag_run.dag + if not dag: + return + + dataflow = AirflowGenerator.generate_dataflow( + cluster=self.config.cluster, + dag=dag, + capture_tags=self.config.capture_tags_info, + capture_owner=self.config.capture_ownership_info, + ) + dataflow.emit(self.emitter, callback=self._make_emit_callback()) + + if HAS_AIRFLOW_DAG_LISTENER_API: + + @hookimpl + @run_in_thread + def on_dag_run_running(self, dag_run: "DagRun", msg: str) -> None: + self._set_log_level() + + logger.debug( + f"DataHub listener got notification about dag run start for {dag_run.dag_id}" + ) + + self.on_dag_start(dag_run) + + self.emitter.flush() + + # TODO: Add hooks for on_dag_run_success, on_dag_run_failed -> call AirflowGenerator.complete_dataflow diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py index d1cec9e5c1b54f..c96fab31647f50 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin.py @@ -1,367 +1,74 @@ import contextlib import logging -import traceback -from typing import Any, Callable, Iterable, List, Optional, Union +import os -from airflow.configuration import conf -from airflow.lineage import PIPELINE_OUTLETS -from airflow.models.baseoperator import BaseOperator from airflow.plugins_manager import AirflowPlugin -from airflow.utils.module_loading import import_string -from cattr import structure -from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult from datahub_airflow_plugin._airflow_compat import AIRFLOW_PATCHED -from datahub_airflow_plugin._airflow_shims import MappedOperator, Operator -from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator -from 
datahub_airflow_plugin.hooks.datahub import DatahubGenericHook -from datahub_airflow_plugin.lineage.datahub import DatahubLineageConfig +from datahub_airflow_plugin._airflow_shims import ( + HAS_AIRFLOW_DAG_LISTENER_API, + HAS_AIRFLOW_LISTENER_API, +) assert AIRFLOW_PATCHED logger = logging.getLogger(__name__) -TASK_ON_FAILURE_CALLBACK = "on_failure_callback" -TASK_ON_SUCCESS_CALLBACK = "on_success_callback" +_USE_AIRFLOW_LISTENER_INTERFACE = HAS_AIRFLOW_LISTENER_API and not os.getenv( + "DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN", "false" +).lower() in ("true", "1") -def get_lineage_config() -> DatahubLineageConfig: - """Load the lineage config from airflow.cfg.""" +if _USE_AIRFLOW_LISTENER_INTERFACE: + try: + from openlineage.airflow.utils import try_import_from_string # noqa: F401 + except ImportError: + # If v2 plugin dependencies are not installed, we fall back to v1. + logger.debug("Falling back to v1 plugin due to missing dependencies.") + _USE_AIRFLOW_LISTENER_INTERFACE = False - enabled = conf.get("datahub", "enabled", fallback=True) - datahub_conn_id = conf.get("datahub", "conn_id", fallback="datahub_rest_default") - cluster = conf.get("datahub", "cluster", fallback="prod") - graceful_exceptions = conf.get("datahub", "graceful_exceptions", fallback=True) - capture_tags_info = conf.get("datahub", "capture_tags_info", fallback=True) - capture_ownership_info = conf.get( - "datahub", "capture_ownership_info", fallback=True - ) - capture_executions = conf.get("datahub", "capture_executions", fallback=True) - return DatahubLineageConfig( - enabled=enabled, - datahub_conn_id=datahub_conn_id, - cluster=cluster, - graceful_exceptions=graceful_exceptions, - capture_ownership_info=capture_ownership_info, - capture_tags_info=capture_tags_info, - capture_executions=capture_executions, - ) +with contextlib.suppress(Exception): + if not os.getenv("DATAHUB_AIRFLOW_PLUGIN_SKIP_FORK_PATCH", "false").lower() in ( + "true", + "1", + ): + # From https://github.com/apache/airflow/discussions/24463#discussioncomment-4404542 + # I'm not exactly sure why this fixes it, but I suspect it's that this + # forces the proxy settings to get cached before the fork happens. + # + # For more details, see https://github.com/python/cpython/issues/58037 + # and https://wefearchange.org/2018/11/forkmacos.rst.html + # and https://bugs.python.org/issue30385#msg293958 + # An alternative fix is to set NO_PROXY='*' -def _task_inlets(operator: "Operator") -> List: - # From Airflow 2.4 _inlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _inlets - if hasattr(operator, "_inlets"): - return operator._inlets # type: ignore[attr-defined, union-attr] - return operator.inlets + from _scproxy import _get_proxy_settings + _get_proxy_settings() -def _task_outlets(operator: "Operator") -> List: - # From Airflow 2.4 _outlets is dropped and inlets used consistently. Earlier it was not the case, so we have to stick there to _outlets - # We have to use _outlets because outlets is empty in Airflow < 2.4.0 - if hasattr(operator, "_outlets"): - return operator._outlets # type: ignore[attr-defined, union-attr] - return operator.outlets +class DatahubPlugin(AirflowPlugin): + name = "datahub_plugin" -def get_inlets_from_task(task: BaseOperator, context: Any) -> Iterable[Any]: - # TODO: Fix for https://github.com/apache/airflow/commit/1b1f3fabc5909a447a6277cafef3a0d4ef1f01ae - # in Airflow 2.4. 
- # TODO: ignore/handle airflow's dataset type in our lineage - - inlets: List[Any] = [] - task_inlets = _task_inlets(task) - # From Airflow 2.3 this should be AbstractOperator but due to compatibility reason lets use BaseOperator - if isinstance(task_inlets, (str, BaseOperator)): - inlets = [ - task_inlets, - ] - - if task_inlets and isinstance(task_inlets, list): - inlets = [] - task_ids = ( - {o for o in task_inlets if isinstance(o, str)} - .union(op.task_id for op in task_inlets if isinstance(op, BaseOperator)) - .intersection(task.get_flat_relative_ids(upstream=True)) - ) - - from airflow.lineage import AUTO - - # pick up unique direct upstream task_ids if AUTO is specified - if AUTO.upper() in task_inlets or AUTO.lower() in task_inlets: - print("Picking up unique direct upstream task_ids as AUTO is specified") - task_ids = task_ids.union( - task_ids.symmetric_difference(task.upstream_task_ids) - ) - - inlets = task.xcom_pull( - context, task_ids=list(task_ids), dag_id=task.dag_id, key=PIPELINE_OUTLETS - ) - - # re-instantiate the obtained inlets - inlets = [ - structure(item["data"], import_string(item["type_name"])) - # _get_instance(structure(item, Metadata)) - for sublist in inlets - if sublist - for item in sublist - ] - - for inlet in task_inlets: - if not isinstance(inlet, str): - inlets.append(inlet) - - return inlets - - -def _make_emit_callback( - logger: logging.Logger, -) -> Callable[[Optional[Exception], str], None]: - def emit_callback(err: Optional[Exception], msg: str) -> None: - if err: - logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) - - return emit_callback - - -def datahub_task_status_callback(context, status): - ti = context["ti"] - task: "BaseOperator" = ti.task - dag = context["dag"] - - # This code is from the original airflow lineage code -> - # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py - inlets = get_inlets_from_task(task, context) - - emitter = ( - DatahubGenericHook(context["_datahub_config"].datahub_conn_id) - .get_underlying_hook() - .make_emitter() - ) - - dataflow = AirflowGenerator.generate_dataflow( - cluster=context["_datahub_config"].cluster, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - task.log.info(f"Emitting Datahub Dataflow: {dataflow}") - dataflow.emit(emitter, callback=_make_emit_callback(task.log)) - - datajob = AirflowGenerator.generate_datajob( - cluster=context["_datahub_config"].cluster, - task=task, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - - for inlet in inlets: - datajob.inlets.append(inlet.urn) - - task_outlets = _task_outlets(task) - for outlet in task_outlets: - datajob.outlets.append(outlet.urn) - - task.log.info(f"Emitting Datahub Datajob: {datajob}") - datajob.emit(emitter, callback=_make_emit_callback(task.log)) - - if context["_datahub_config"].capture_executions: - dpi = AirflowGenerator.run_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag=dag, - dag_run=context["dag_run"], - datajob=datajob, - start_timestamp_millis=int(ti.start_date.timestamp() * 1000), - ) - - task.log.info(f"Emitted Start Datahub Dataprocess Instance: {dpi}") - - dpi = AirflowGenerator.complete_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag_run=context["dag_run"], - result=status, - dag=dag, - 
datajob=datajob, - end_timestamp_millis=int(ti.end_date.timestamp() * 1000), - ) - task.log.info(f"Emitted Completed Data Process Instance: {dpi}") - - emitter.flush() - - -def datahub_pre_execution(context): - ti = context["ti"] - task: "BaseOperator" = ti.task - dag = context["dag"] - - task.log.info("Running Datahub pre_execute method") - - emitter = ( - DatahubGenericHook(context["_datahub_config"].datahub_conn_id) - .get_underlying_hook() - .make_emitter() - ) - - # This code is from the original airflow lineage code -> - # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py - inlets = get_inlets_from_task(task, context) - - datajob = AirflowGenerator.generate_datajob( - cluster=context["_datahub_config"].cluster, - task=context["ti"].task, - dag=dag, - capture_tags=context["_datahub_config"].capture_tags_info, - capture_owner=context["_datahub_config"].capture_ownership_info, - ) - - for inlet in inlets: - datajob.inlets.append(inlet.urn) - - task_outlets = _task_outlets(task) - - for outlet in task_outlets: - datajob.outlets.append(outlet.urn) - - task.log.info(f"Emitting Datahub dataJob {datajob}") - datajob.emit(emitter, callback=_make_emit_callback(task.log)) - - if context["_datahub_config"].capture_executions: - dpi = AirflowGenerator.run_datajob( - emitter=emitter, - cluster=context["_datahub_config"].cluster, - ti=context["ti"], - dag=dag, - dag_run=context["dag_run"], - datajob=datajob, - start_timestamp_millis=int(ti.start_date.timestamp() * 1000), - ) - - task.log.info(f"Emitting Datahub Dataprocess Instance: {dpi}") - - emitter.flush() - - -def _wrap_pre_execution(pre_execution): - def custom_pre_execution(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - datahub_pre_execution(context) - - # Call original policy - if pre_execution: - pre_execution(context) - - return custom_pre_execution - - -def _wrap_on_failure_callback(on_failure_callback): - def custom_on_failure_callback(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - try: - datahub_task_status_callback(context, status=InstanceRunResult.FAILURE) - except Exception as e: - if not config.graceful_exceptions: - raise e - else: - print(f"Exception: {traceback.format_exc()}") - - # Call original policy - if on_failure_callback: - on_failure_callback(context) - - return custom_on_failure_callback - - -def _wrap_on_success_callback(on_success_callback): - def custom_on_success_callback(context): - config = get_lineage_config() - if config.enabled: - context["_datahub_config"] = config - try: - datahub_task_status_callback(context, status=InstanceRunResult.SUCCESS) - except Exception as e: - if not config.graceful_exceptions: - raise e - else: - print(f"Exception: {traceback.format_exc()}") - - # Call original policy - if on_success_callback: - on_success_callback(context) - - return custom_on_success_callback - - -def task_policy(task: Union[BaseOperator, MappedOperator]) -> None: - task.log.debug(f"Setting task policy for Dag: {task.dag_id} Task: {task.task_id}") - # task.add_inlets(["auto"]) - # task.pre_execute = _wrap_pre_execution(task.pre_execute) - - # MappedOperator's callbacks don't have setters until Airflow 2.X.X - # https://github.com/apache/airflow/issues/24547 - # We can bypass this by going through partial_kwargs for now - if MappedOperator and isinstance(task, MappedOperator): # type: ignore - on_failure_callback_prop: property = getattr( - MappedOperator, TASK_ON_FAILURE_CALLBACK - 
) - on_success_callback_prop: property = getattr( - MappedOperator, TASK_ON_SUCCESS_CALLBACK - ) - if not on_failure_callback_prop.fset or not on_success_callback_prop.fset: - task.log.debug( - "Using MappedOperator's partial_kwargs instead of callback properties" - ) - task.partial_kwargs[TASK_ON_FAILURE_CALLBACK] = _wrap_on_failure_callback( - task.on_failure_callback + if _USE_AIRFLOW_LISTENER_INTERFACE: + if HAS_AIRFLOW_DAG_LISTENER_API: + from datahub_airflow_plugin.datahub_listener import ( # type: ignore[misc] + get_airflow_plugin_listener, ) - task.partial_kwargs[TASK_ON_SUCCESS_CALLBACK] = _wrap_on_success_callback( - task.on_success_callback - ) - return - - task.on_failure_callback = _wrap_on_failure_callback(task.on_failure_callback) # type: ignore - task.on_success_callback = _wrap_on_success_callback(task.on_success_callback) # type: ignore - # task.pre_execute = _wrap_pre_execution(task.pre_execute) - - -def _wrap_task_policy(policy): - if policy and hasattr(policy, "_task_policy_patched_by"): - return policy - - def custom_task_policy(task): - policy(task) - task_policy(task) - - # Add a flag to the policy to indicate that we've patched it. - custom_task_policy._task_policy_patched_by = "datahub_plugin" # type: ignore[attr-defined] - return custom_task_policy + listeners: list = list(filter(None, [get_airflow_plugin_listener()])) -def _patch_policy(settings): - if hasattr(settings, "task_policy"): - datahub_task_policy = _wrap_task_policy(settings.task_policy) - settings.task_policy = datahub_task_policy + else: + # On Airflow < 2.5, we need the listener to be a module. + # This is just a quick shim layer to make that work. + # The DAG listener API was added at the same time as this method + # was fixed, so we're reusing the same check variable. + # + # Related Airflow change: https://github.com/apache/airflow/pull/27113. + import datahub_airflow_plugin._datahub_listener_module as _listener_module # type: ignore[misc] + listeners = [_listener_module] -def _patch_datahub_policy(): - with contextlib.suppress(ImportError): - import airflow_local_settings - _patch_policy(airflow_local_settings) - - from airflow.models.dagbag import settings - - _patch_policy(settings) - - -_patch_datahub_policy() - - -class DatahubPlugin(AirflowPlugin): - name = "datahub_plugin" +if not _USE_AIRFLOW_LISTENER_INTERFACE: + # Use the policy patcher mechanism on Airflow 2.2 and below. 
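# A self-contained sketch (not part of the patch) of the plugin mode
# selection above: the v2 listener path is taken only when the Airflow
# listener API exists and the v1 escape hatch is unset.
import os

HAS_AIRFLOW_LISTENER_API = True  # stand-in for the real shim constant

use_listener_interface = HAS_AIRFLOW_LISTENER_API and os.getenv(
    "DATAHUB_AIRFLOW_PLUGIN_USE_V1_PLUGIN", "false"
).lower() not in ("true", "1")

assert use_listener_interface is True  # no override -> v2 listener plugin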
+ import datahub_airflow_plugin.datahub_plugin_v22 # noqa: F401 diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py new file mode 100644 index 00000000000000..f9a2119f51e329 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/datahub_plugin_v22.py @@ -0,0 +1,370 @@ +import contextlib +import logging +import traceback +from typing import Any, Callable, Iterable, List, Optional, Union + +import airflow +from airflow.lineage import PIPELINE_OUTLETS +from airflow.models.baseoperator import BaseOperator +from airflow.utils.module_loading import import_string +from cattr import structure +from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult +from datahub.telemetry import telemetry + +from datahub_airflow_plugin._airflow_shims import ( + MappedOperator, + get_task_inlets, + get_task_outlets, +) +from datahub_airflow_plugin._config import get_lineage_config +from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator +from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook +from datahub_airflow_plugin.lineage.datahub import DatahubLineageConfig + +TASK_ON_FAILURE_CALLBACK = "on_failure_callback" +TASK_ON_SUCCESS_CALLBACK = "on_success_callback" +TASK_ON_RETRY_CALLBACK = "on_retry_callback" + + +def get_task_inlets_advanced(task: BaseOperator, context: Any) -> Iterable[Any]: + # TODO: Fix for https://github.com/apache/airflow/commit/1b1f3fabc5909a447a6277cafef3a0d4ef1f01ae + # in Airflow 2.4. + # TODO: ignore/handle airflow's dataset type in our lineage + + inlets: List[Any] = [] + task_inlets = get_task_inlets(task) + # From Airflow 2.3 this should be AbstractOperator but due to compatibility reason lets use BaseOperator + if isinstance(task_inlets, (str, BaseOperator)): + inlets = [ + task_inlets, + ] + + if task_inlets and isinstance(task_inlets, list): + inlets = [] + task_ids = ( + {o for o in task_inlets if isinstance(o, str)} + .union(op.task_id for op in task_inlets if isinstance(op, BaseOperator)) + .intersection(task.get_flat_relative_ids(upstream=True)) + ) + + from airflow.lineage import AUTO + + # pick up unique direct upstream task_ids if AUTO is specified + if AUTO.upper() in task_inlets or AUTO.lower() in task_inlets: + print("Picking up unique direct upstream task_ids as AUTO is specified") + task_ids = task_ids.union( + task_ids.symmetric_difference(task.upstream_task_ids) + ) + + inlets = task.xcom_pull( + context, task_ids=list(task_ids), dag_id=task.dag_id, key=PIPELINE_OUTLETS + ) + + # re-instantiate the obtained inlets + inlets = [ + structure(item["data"], import_string(item["type_name"])) + # _get_instance(structure(item, Metadata)) + for sublist in inlets + if sublist + for item in sublist + ] + + for inlet in task_inlets: + if not isinstance(inlet, str): + inlets.append(inlet) + + return inlets + + +def _make_emit_callback( + logger: logging.Logger, +) -> Callable[[Optional[Exception], str], None]: + def emit_callback(err: Optional[Exception], msg: str) -> None: + if err: + logger.error(f"Error sending metadata to datahub: {msg}", exc_info=err) + + return emit_callback + + +def datahub_task_status_callback(context, status): + ti = context["ti"] + task: "BaseOperator" = ti.task + dag = context["dag"] + config: DatahubLineageConfig = context["_datahub_config"] + + # This code is from the original airflow lineage code -> + # 
https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py + inlets = get_task_inlets_advanced(task, context) + + emitter = ( + DatahubGenericHook(config.datahub_conn_id).get_underlying_hook().make_emitter() + ) + + dataflow = AirflowGenerator.generate_dataflow( + cluster=config.cluster, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + task.log.info(f"Emitting Datahub Dataflow: {dataflow}") + dataflow.emit(emitter, callback=_make_emit_callback(task.log)) + + datajob = AirflowGenerator.generate_datajob( + cluster=config.cluster, + task=task, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + + for inlet in inlets: + datajob.inlets.append(inlet.urn) + + task_outlets = get_task_outlets(task) + for outlet in task_outlets: + datajob.outlets.append(outlet.urn) + + task.log.info(f"Emitting Datahub Datajob: {datajob}") + datajob.emit(emitter, callback=_make_emit_callback(task.log)) + + if config.capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag=dag, + dag_run=context["dag_run"], + datajob=datajob, + start_timestamp_millis=int(ti.start_date.timestamp() * 1000), + ) + + task.log.info(f"Emitted Start Datahub Dataprocess Instance: {dpi}") + + dpi = AirflowGenerator.complete_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag_run=context["dag_run"], + result=status, + dag=dag, + datajob=datajob, + end_timestamp_millis=int(ti.end_date.timestamp() * 1000), + ) + task.log.info(f"Emitted Completed Data Process Instance: {dpi}") + + emitter.flush() + + +def datahub_pre_execution(context): + ti = context["ti"] + task: "BaseOperator" = ti.task + dag = context["dag"] + config: DatahubLineageConfig = context["_datahub_config"] + + task.log.info("Running Datahub pre_execute method") + + emitter = ( + DatahubGenericHook(config.datahub_conn_id).get_underlying_hook().make_emitter() + ) + + # This code is from the original airflow lineage code -> + # https://github.com/apache/airflow/blob/main/airflow/lineage/__init__.py + inlets = get_task_inlets_advanced(task, context) + + datajob = AirflowGenerator.generate_datajob( + cluster=config.cluster, + task=ti.task, + dag=dag, + capture_tags=config.capture_tags_info, + capture_owner=config.capture_ownership_info, + ) + + for inlet in inlets: + datajob.inlets.append(inlet.urn) + + task_outlets = get_task_outlets(task) + + for outlet in task_outlets: + datajob.outlets.append(outlet.urn) + + task.log.info(f"Emitting Datahub dataJob {datajob}") + datajob.emit(emitter, callback=_make_emit_callback(task.log)) + + if config.capture_executions: + dpi = AirflowGenerator.run_datajob( + emitter=emitter, + cluster=config.cluster, + ti=ti, + dag=dag, + dag_run=context["dag_run"], + datajob=datajob, + start_timestamp_millis=int(ti.start_date.timestamp() * 1000), + ) + + task.log.info(f"Emitting Datahub Dataprocess Instance: {dpi}") + + emitter.flush() + + +def _wrap_pre_execution(pre_execution): + def custom_pre_execution(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + datahub_pre_execution(context) + + # Call original policy + if pre_execution: + pre_execution(context) + + return custom_pre_execution + + +def _wrap_on_failure_callback(on_failure_callback): + def custom_on_failure_callback(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + try: + 
datahub_task_status_callback(context, status=InstanceRunResult.FAILURE) + except Exception as e: + if not config.graceful_exceptions: + raise e + else: + print(f"Exception: {traceback.format_exc()}") + + # Call original policy + if on_failure_callback: + on_failure_callback(context) + + return custom_on_failure_callback + + +def _wrap_on_success_callback(on_success_callback): + def custom_on_success_callback(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + try: + datahub_task_status_callback(context, status=InstanceRunResult.SUCCESS) + except Exception as e: + if not config.graceful_exceptions: + raise e + else: + print(f"Exception: {traceback.format_exc()}") + + # Call original policy + if on_success_callback: + on_success_callback(context) + + return custom_on_success_callback + + +def _wrap_on_retry_callback(on_retry_callback): + def custom_on_retry_callback(context): + config = get_lineage_config() + if config.enabled: + context["_datahub_config"] = config + try: + datahub_task_status_callback( + context, status=InstanceRunResult.UP_FOR_RETRY + ) + except Exception as e: + if not config.graceful_exceptions: + raise e + else: + print(f"Exception: {traceback.format_exc()}") + + # Call original policy + if on_retry_callback: + on_retry_callback(context) + + return custom_on_retry_callback + + +def task_policy(task: Union[BaseOperator, MappedOperator]) -> None: + task.log.debug(f"Setting task policy for Dag: {task.dag_id} Task: {task.task_id}") + # task.add_inlets(["auto"]) + # task.pre_execute = _wrap_pre_execution(task.pre_execute) + + # MappedOperator's callbacks don't have setters until Airflow 2.X.X + # https://github.com/apache/airflow/issues/24547 + # We can bypass this by going through partial_kwargs for now + if MappedOperator and isinstance(task, MappedOperator): # type: ignore + on_failure_callback_prop: property = getattr( + MappedOperator, TASK_ON_FAILURE_CALLBACK + ) + on_success_callback_prop: property = getattr( + MappedOperator, TASK_ON_SUCCESS_CALLBACK + ) + on_retry_callback_prop: property = getattr( + MappedOperator, TASK_ON_RETRY_CALLBACK + ) + if ( + not on_failure_callback_prop.fset + or not on_success_callback_prop.fset + or not on_retry_callback_prop.fset + ): + task.log.debug( + "Using MappedOperator's partial_kwargs instead of callback properties" + ) + task.partial_kwargs[TASK_ON_FAILURE_CALLBACK] = _wrap_on_failure_callback( + task.on_failure_callback + ) + task.partial_kwargs[TASK_ON_SUCCESS_CALLBACK] = _wrap_on_success_callback( + task.on_success_callback + ) + task.partial_kwargs[TASK_ON_RETRY_CALLBACK] = _wrap_on_retry_callback( + task.on_retry_callback + ) + return + + task.on_failure_callback = _wrap_on_failure_callback(task.on_failure_callback) # type: ignore + task.on_success_callback = _wrap_on_success_callback(task.on_success_callback) # type: ignore + task.on_retry_callback = _wrap_on_retry_callback(task.on_retry_callback) # type: ignore + # task.pre_execute = _wrap_pre_execution(task.pre_execute) + + +def _wrap_task_policy(policy): + if policy and hasattr(policy, "_task_policy_patched_by"): + return policy + + def custom_task_policy(task): + policy(task) + task_policy(task) + + # Add a flag to the policy to indicate that we've patched it. 
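The marker attribute set on the next line is what makes the patching idempotent: wrapping an already-wrapped policy would otherwise stack a new callback layer on every import. The same guard pattern in isolation, with illustrative names:

def wrap_once(policy, extra_policy, marker="_task_policy_patched_by"):
    # Return the policy unchanged if it already carries our marker.
    if policy is not None and hasattr(policy, marker):
        return policy

    def wrapped(task):
        if policy:
            policy(task)  # preserve the user's original policy
        extra_policy(task)

    setattr(wrapped, marker, "datahub_plugin")
    return wrapped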
+ custom_task_policy._task_policy_patched_by = "datahub_plugin" # type: ignore[attr-defined] + return custom_task_policy + + +def _patch_policy(settings): + if hasattr(settings, "task_policy"): + datahub_task_policy = _wrap_task_policy(settings.task_policy) + settings.task_policy = datahub_task_policy + + +def _patch_datahub_policy(): + with contextlib.suppress(ImportError): + import airflow_local_settings + + _patch_policy(airflow_local_settings) + + from airflow.models.dagbag import settings + + _patch_policy(settings) + + plugin_config = get_lineage_config() + telemetry.telemetry_instance.ping( + "airflow-plugin-init", + { + "airflow-version": airflow.__version__, + "datahub-airflow-plugin": "v1", + "capture_executions": plugin_config.capture_executions, + "capture_tags": plugin_config.capture_tags_info, + "capture_ownership": plugin_config.capture_ownership_info, + }, + ) + + +_patch_datahub_policy() diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py index f40295c6bb883a..0d7cdb6b6e90a5 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/example_dags/lineage_emission_dag.py @@ -2,12 +2,11 @@ This example demonstrates how to emit lineage to DataHub within an Airflow DAG. """ - from datetime import timedelta import datahub.emitter.mce_builder as builder from airflow import DAG -from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator +from airflow.operators.bash import BashOperator from airflow.utils.dates import days_ago from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator @@ -33,23 +32,10 @@ catchup=False, default_view="tree", ) as dag: - # This example shows a SnowflakeOperator followed by a lineage emission. However, the - # same DatahubEmitterOperator can be used to emit lineage in any context. 
- - sql = """CREATE OR REPLACE TABLE `mydb.schema.tableC` AS - WITH some_table AS ( - SELECT * FROM `mydb.schema.tableA` - ), - some_other_table AS ( - SELECT id, some_column FROM `mydb.schema.tableB` - ) - SELECT * FROM some_table - LEFT JOIN some_other_table ON some_table.unique_id=some_other_table.id""" - transformation_task = SnowflakeOperator( - task_id="snowflake_transformation", + transformation_task = BashOperator( + task_id="transformation_task", dag=dag, - snowflake_conn_id="snowflake_default", - sql=sql, + bash_command="echo 'This is where you might run your data tooling.'", ) emit_lineage_task = DatahubEmitterOperator( diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py index 8fb7363f8cad1a..9604931795ccb9 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/hooks/datahub.py @@ -1,7 +1,9 @@ -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, Optional, Sequence, Tuple, Union from airflow.exceptions import AirflowException from airflow.hooks.base import BaseHook +from datahub.emitter.generic_emitter import Emitter +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( MetadataChangeEvent, MetadataChangeProposal, @@ -11,6 +13,7 @@ from airflow.models.connection import Connection from datahub.emitter.kafka_emitter import DatahubKafkaEmitter from datahub.emitter.rest_emitter import DatahubRestEmitter + from datahub.emitter.synchronized_file_emitter import SynchronizedFileEmitter from datahub.ingestion.sink.datahub_kafka import KafkaSinkConfig @@ -80,17 +83,24 @@ def make_emitter(self) -> "DatahubRestEmitter": return datahub.emitter.rest_emitter.DatahubRestEmitter(*self._get_config()) - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + def emit( + self, + items: Sequence[ + Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ] + ], + ) -> None: emitter = self.make_emitter() - for mce in mces: - emitter.emit_mce(mce) + for item in items: + emitter.emit(item) - def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: - emitter = self.make_emitter() - - for mce in mcps: - emitter.emit_mcp(mce) + # Retained for backwards compatibility. + emit_mces = emit + emit_mcps = emit class DatahubKafkaHook(BaseHook): @@ -152,7 +162,16 @@ def make_emitter(self) -> "DatahubKafkaEmitter": sink_config = self._get_config() return datahub.emitter.kafka_emitter.DatahubKafkaEmitter(sink_config) - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: + def emit( + self, + items: Sequence[ + Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ] + ], + ) -> None: emitter = self.make_emitter() errors = [] @@ -160,29 +179,50 @@ def callback(exc, msg): if exc: errors.append(exc) - for mce in mces: - emitter.emit_mce_async(mce, callback) + for mce in items: + emitter.emit(mce, callback) emitter.flush() if errors: - raise AirflowException(f"failed to push some MCEs: {errors}") + raise AirflowException(f"failed to push some metadata: {errors}") - def emit_mcps(self, mcps: List[MetadataChangeProposal]) -> None: - emitter = self.make_emitter() - errors = [] + # Retained for backwards compatibility. 
+ emit_mces = emit + emit_mcps = emit - def callback(exc, msg): - if exc: - errors.append(exc) - for mcp in mcps: - emitter.emit_mcp_async(mcp, callback) +class SynchronizedFileHook(BaseHook): + conn_type = "datahub-file" - emitter.flush() + def __init__(self, datahub_conn_id: str) -> None: + super().__init__() + self.datahub_conn_id = datahub_conn_id - if errors: - raise AirflowException(f"failed to push some MCPs: {errors}") + def make_emitter(self) -> "SynchronizedFileEmitter": + from datahub.emitter.synchronized_file_emitter import SynchronizedFileEmitter + + conn = self.get_connection(self.datahub_conn_id) + filename = conn.host + if not filename: + raise AirflowException("filename parameter is required") + + return SynchronizedFileEmitter(filename=filename) + + def emit( + self, + items: Sequence[ + Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ] + ], + ) -> None: + emitter = self.make_emitter() + + for item in items: + emitter.emit(item) class DatahubGenericHook(BaseHook): @@ -198,7 +238,9 @@ def __init__(self, datahub_conn_id: str) -> None: super().__init__() self.datahub_conn_id = datahub_conn_id - def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: + def get_underlying_hook( + self, + ) -> Union[DatahubRestHook, DatahubKafkaHook, SynchronizedFileHook]: conn = self.get_connection(self.datahub_conn_id) # We need to figure out the underlying hook type. First check the @@ -213,6 +255,11 @@ def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: or conn.conn_type == DatahubKafkaHook.conn_type.replace("-", "_") ): return DatahubKafkaHook(self.datahub_conn_id) + elif ( + conn.conn_type == SynchronizedFileHook.conn_type + or conn.conn_type == SynchronizedFileHook.conn_type.replace("-", "_") + ): + return SynchronizedFileHook(self.datahub_conn_id) elif "rest" in self.datahub_conn_id: return DatahubRestHook(self.datahub_conn_id) elif "kafka" in self.datahub_conn_id: @@ -222,8 +269,20 @@ def get_underlying_hook(self) -> Union[DatahubRestHook, DatahubKafkaHook]: f"DataHub cannot handle conn_type {conn.conn_type} in {conn}" ) - def make_emitter(self) -> Union["DatahubRestEmitter", "DatahubKafkaEmitter"]: + def make_emitter(self) -> Emitter: return self.get_underlying_hook().make_emitter() - def emit_mces(self, mces: List[MetadataChangeEvent]) -> None: - return self.get_underlying_hook().emit_mces(mces) + def emit( + self, + items: Sequence[ + Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ] + ], + ) -> None: + return self.get_underlying_hook().emit(items) + + # Retained for backwards compatibility. 
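DatahubGenericHook resolves the new file-backed hook purely from the connection's conn_type, and SynchronizedFileHook takes its output path from conn.host. A sketch of a connection that would route emission to a local file, e.g. for local testing (the id and path are illustrative):

from airflow.models.connection import Connection

file_conn = Connection(
    conn_id="datahub_file_default",
    conn_type="datahub-file",  # the underscore form "datahub_file" is matched too
    host="/tmp/datahub_metadata.json",  # becomes the emitter's filename
)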
+ emit_mces = emit diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py similarity index 72% rename from metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py rename to metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py index d91c039ffa718d..f5f519fa23b11e 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/_lineage_core.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/_lineage_core.py @@ -1,11 +1,10 @@ from datetime import datetime from typing import TYPE_CHECKING, Dict, List -import datahub.emitter.mce_builder as builder from datahub.api.entities.dataprocess.dataprocess_instance import InstanceRunResult -from datahub.configuration.common import ConfigModel from datahub.utilities.urns.dataset_urn import DatasetUrn +from datahub_airflow_plugin._config import DatahubLineageConfig from datahub_airflow_plugin.client.airflow_generator import AirflowGenerator from datahub_airflow_plugin.entities import _Entity @@ -15,39 +14,14 @@ from airflow.models.taskinstance import TaskInstance from datahub_airflow_plugin._airflow_shims import Operator - from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook def _entities_to_urn_list(iolets: List[_Entity]) -> List[DatasetUrn]: return [DatasetUrn.create_from_string(let.urn) for let in iolets] -class DatahubBasicLineageConfig(ConfigModel): - enabled: bool = True - - # DataHub hook connection ID. - datahub_conn_id: str - - # Cluster to associate with the pipelines and tasks. Defaults to "prod". - cluster: str = builder.DEFAULT_FLOW_CLUSTER - - # If true, the owners field of the DAG will be capture as a DataHub corpuser. - capture_ownership_info: bool = True - - # If true, the tags field of the DAG will be captured as DataHub tags. - capture_tags_info: bool = True - - capture_executions: bool = False - - def make_emitter_hook(self) -> "DatahubGenericHook": - # This is necessary to avoid issues with circular imports. - from datahub_airflow_plugin.hooks.datahub import DatahubGenericHook - - return DatahubGenericHook(self.datahub_conn_id) - - def send_lineage_to_datahub( - config: DatahubBasicLineageConfig, + config: DatahubLineageConfig, operator: "Operator", inlets: List[_Entity], outlets: List[_Entity], diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py index c41bb2b2a1e371..3ebe7831d08f9a 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/lineage/datahub.py @@ -4,8 +4,8 @@ from airflow.configuration import conf from airflow.lineage.backend import LineageBackend -from datahub_airflow_plugin._lineage_core import ( - DatahubBasicLineageConfig, +from datahub_airflow_plugin.lineage._lineage_core import ( + DatahubLineageConfig, send_lineage_to_datahub, ) @@ -13,14 +13,7 @@ from airflow.models.baseoperator import BaseOperator -class DatahubLineageConfig(DatahubBasicLineageConfig): - # If set to true, most runtime errors in the lineage backend will be - # suppressed and will not cause the overall task to fail. Note that - # configuration issues will still throw exceptions. 
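With that suppression knob removed from the backend config, the lineage backend now always logs and swallows errors (see the send_lineage hunk below); the graceful_exceptions flag lives on in the plugin's own DatahubLineageConfig and is still checked by the v22 callbacks above. Enabling the backend itself is unchanged; a sketch of the airflow.cfg stanza, matching the docstring retained in this file (connection id assumed):

[lineage]
backend = datahub_airflow_plugin.lineage.datahub.DatahubLineageBackend
datahub_kwargs = {
    "datahub_conn_id": "datahub_rest_default",
    "capture_ownership_info": true,
    "capture_tags_info": true }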
- graceful_exceptions: bool = True - - -def get_lineage_config() -> DatahubLineageConfig: +def get_lineage_backend_config() -> DatahubLineageConfig: """Load the lineage config from airflow.cfg.""" # The kwargs pattern is also used for secret backends. @@ -51,8 +44,7 @@ class DatahubLineageBackend(LineageBackend): datahub_kwargs = { "datahub_conn_id": "datahub_rest_default", "capture_ownership_info": true, - "capture_tags_info": true, - "graceful_exceptions": true } + "capture_tags_info": true } # The above indentation is important! """ @@ -61,7 +53,7 @@ def __init__(self) -> None: # By attempting to get and parse the config, we can detect configuration errors # ahead of time. The init method is only called in Airflow 2.x. - _ = get_lineage_config() + _ = get_lineage_backend_config() # With Airflow 2.0, this can be an instance method. However, with Airflow 1.10.x, this # method is used statically, even though LineageBackend declares it as an instance variable. @@ -72,7 +64,7 @@ def send_lineage( outlets: Optional[List] = None, # unused context: Optional[Dict] = None, ) -> None: - config = get_lineage_config() + config = get_lineage_backend_config() if not config.enabled: return @@ -82,10 +74,4 @@ def send_lineage( config, operator, operator.inlets, operator.outlets, context ) except Exception as e: - if config.graceful_exceptions: - operator.log.error(e) - operator.log.info( - "Suppressing error because graceful_exceptions is set" - ) - else: - raise + operator.log.error(e) diff --git a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py index 109e7ddfe4dfa2..15b50c51a561da 100644 --- a/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py +++ b/metadata-ingestion-modules/airflow-plugin/src/datahub_airflow_plugin/operators/datahub.py @@ -57,7 +57,7 @@ def __init__( # type: ignore[no-untyped-def] datahub_conn_id=datahub_conn_id, **kwargs, ) - self.mces = mces + self.metadata = mces def execute(self, context): - self.generic_hook.get_underlying_hook().emit_mces(self.mces) + self.generic_hook.get_underlying_hook().emit(self.metadata) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/conftest.py b/metadata-ingestion-modules/airflow-plugin/tests/conftest.py new file mode 100644 index 00000000000000..d2c45e723f1b00 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/conftest.py @@ -0,0 +1,6 @@ +def pytest_addoption(parser): + parser.addoption( + "--update-golden-files", + action="store_true", + default=False, + ) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py new file mode 100644 index 00000000000000..8b0803ab98422b --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py @@ -0,0 +1,34 @@ +from datetime import datetime + +from airflow import DAG +from airflow.operators.bash import BashOperator + +from datahub_airflow_plugin.entities import Dataset, Urn + +with DAG( + "basic_iolets", + start_date=datetime(2023, 1, 1), + schedule_interval=None, + catchup=False, +) as dag: + task = BashOperator( + task_id="run_data_task", + dag=dag, + bash_command="echo 'This is where you might run your data tooling.'", + inlets=[ + Dataset(platform="snowflake", name="mydb.schema.tableA"), + Dataset(platform="snowflake", name="mydb.schema.tableB", 
env="DEV"), + Dataset( + platform="snowflake", + name="mydb.schema.tableC", + platform_instance="cloud", + ), + Urn( + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ), + ], + outlets=[ + Dataset("snowflake", "mydb.schema.tableD"), + Dataset("snowflake", "mydb.schema.tableE"), + ], + ) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py new file mode 100644 index 00000000000000..1dd047f0a6dccb --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py @@ -0,0 +1,34 @@ +from datetime import datetime + +from airflow import DAG +from airflow.operators.bash import BashOperator + +from datahub_airflow_plugin.entities import Dataset, Urn + +with DAG( + "simple_dag", + start_date=datetime(2023, 1, 1), + schedule_interval=None, + catchup=False, + description="A simple DAG that runs a few fake data tasks.", +) as dag: + task1 = BashOperator( + task_id="task_1", + dag=dag, + bash_command="echo 'task 1'", + inlets=[ + Dataset(platform="snowflake", name="mydb.schema.tableA"), + Urn( + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ), + ], + outlets=[Dataset("snowflake", "mydb.schema.tableD")], + ) + + task2 = BashOperator( + task_id="run_another_data_task", + dag=dag, + bash_command="echo 'task 2'", + ) + + task1 >> task2 diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py new file mode 100644 index 00000000000000..347d0f88b0cd01 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py @@ -0,0 +1,32 @@ +from datetime import datetime + +from airflow import DAG +from airflow.providers.snowflake.operators.snowflake import SnowflakeOperator + +SNOWFLAKE_COST_TABLE = "costs" +SNOWFLAKE_PROCESSED_TABLE = "processed_costs" + +with DAG( + "snowflake_operator", + start_date=datetime(2023, 1, 1), + schedule_interval=None, + catchup=False, +) as dag: + transform_cost_table = SnowflakeOperator( + snowflake_conn_id="my_snowflake", + task_id="transform_cost_table", + sql=""" + CREATE OR REPLACE TABLE {{ params.out_table_name }} AS + SELECT + id, + month, + total_cost, + area, + total_cost / area as cost_per_area + FROM {{ params.in_table_name }} + """, + params={ + "in_table_name": SNOWFLAKE_COST_TABLE, + "out_table_name": SNOWFLAKE_PROCESSED_TABLE, + }, + ) diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py new file mode 100644 index 00000000000000..77faec3c8935ae --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py @@ -0,0 +1,75 @@ +from datetime import datetime + +from airflow import DAG +from airflow.providers.sqlite.operators.sqlite import SqliteOperator + +CONN_ID = "my_sqlite" + +COST_TABLE = "costs" +PROCESSED_TABLE = "processed_costs" + +with DAG( + "sqlite_operator", + start_date=datetime(2023, 1, 1), + schedule_interval=None, + catchup=False, +) as dag: + create_cost_table = SqliteOperator( + sqlite_conn_id=CONN_ID, + task_id="create_cost_table", + sql=""" + CREATE TABLE IF NOT EXISTS {{ params.table_name }} ( + id INTEGER PRIMARY KEY, + month TEXT NOT NULL, + total_cost REAL NOT NULL, + area REAL NOT NULL + ) + """, + 
params={"table_name": COST_TABLE}, + ) + + populate_cost_table = SqliteOperator( + sqlite_conn_id=CONN_ID, + task_id="populate_cost_table", + sql=""" + INSERT INTO {{ params.table_name }} (id, month, total_cost, area) + VALUES + (1, '2021-01', 100, 10), + (2, '2021-02', 200, 20), + (3, '2021-03', 300, 30) + """, + params={"table_name": COST_TABLE}, + ) + + transform_cost_table = SqliteOperator( + sqlite_conn_id=CONN_ID, + task_id="transform_cost_table", + sql=""" + CREATE TABLE IF NOT EXISTS {{ params.out_table_name }} AS + SELECT + id, + month, + total_cost, + area, + total_cost / area as cost_per_area + FROM {{ params.in_table_name }} + """, + params={ + "in_table_name": COST_TABLE, + "out_table_name": PROCESSED_TABLE, + }, + ) + + cleanup_tables = [] + for table_name in [COST_TABLE, PROCESSED_TABLE]: + cleanup_table = SqliteOperator( + sqlite_conn_id=CONN_ID, + task_id=f"cleanup_{table_name}", + sql=""" + DROP TABLE {{ params.table_name }} + """, + params={"table_name": table_name}, + ) + cleanup_tables.append(cleanup_table) + + create_cost_table >> populate_cost_table >> transform_cost_table >> cleanup_tables diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json new file mode 100644 index 00000000000000..a4c17c73e9c7e6 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_basic_iolets.json @@ -0,0 +1,538 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "None", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=basic_iolets", + "name": "basic_iolets" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": 
"dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "0.176536", + "start_date": "2023-09-30 00:49:56.670239+00:00", + "end_date": "2023-09-30 00:49:56.846775+00:00", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "1", + "max_tries": "0", + "external_executor_id": "None", + "state": "success", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets", + "orchestrator": "airflow", + 
"dag_id": "basic_iolets", + "task_id": "run_data_task" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets", + "name": "basic_iolets_run_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696034996670, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034996670, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 2 + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": 
"urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034996846, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json new file mode 100644 index 00000000000000..a0a95716a09931 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v1_simple_dag.json @@ -0,0 +1,728 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "'A simple DAG that runs a few fake data tasks.'", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "None", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag", + "name": "simple_dag", + "description": "A simple DAG that runs a few fake data tasks." + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'task_1'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'task_1'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "['run_another_data_task']", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", + "name": "task_1", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'task_1'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'task_1'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "['run_another_data_task']", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", + "name": "task_1", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": 
{ + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "0.175983", + "start_date": "2023-09-30 00:48:58.943850+00:00", + "end_date": "2023-09-30 00:48:59.119833+00:00", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "1", + "max_tries": "0", + "external_executor_id": "None", + "state": "success", + "operator": "BashOperator", + "priority_weight": "2", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag", + "orchestrator": "airflow", + "dag_id": "simple_dag", + "task_id": "task_1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag", + "name": "simple_dag_task_1_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696034938943, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034938943, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 2 + } + } +}, +{ 
+ "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034939119, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "'A simple DAG that runs a few fake data tasks.'", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "None", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag", + "name": "simple_dag", + "description": "A simple DAG that runs a few fake data tasks." + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + 
"changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "'all_success'", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "0.129888", + "start_date": "2023-09-30 00:49:02.158752+00:00", + "end_date": "2023-09-30 00:49:02.288640+00:00", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "1", + "max_tries": "0", + "external_executor_id": "None", + "state": "success", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag", + "orchestrator": "airflow", + "dag_id": "simple_dag", + "task_id": "run_another_data_task" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag", + "name": "simple_dag_run_another_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696034942158, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": 
"urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034942158, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 2 + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696034942288, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json new file mode 100644 index 00000000000000..1974f1f085df0b --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets.json @@ -0,0 +1,540 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=basic_iolets", + "name": "basic_iolets" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", + 
"openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 01:13:14.266272+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets&map_index=-1", + "orchestrator": "airflow", + "dag_id": "basic_iolets", + "task_id": "run_data_task" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets&map_index=-1", + "name": "basic_iolets_run_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696036394266, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696036394266, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', 
+ "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]",
+ "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}"
+ },
+ "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task",
+ "name": "run_data_task",
+ "type": {
+ "string": "COMMAND"
+ }
+ }
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInputOutput",
+ "aspect": {
+ "json": {
+ "inputDatasets": [
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)",
+ "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)",
"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696036394833, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json new file mode 100644 index 00000000000000..d02951bc9e82dd --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_basic_iolets_no_dag_listener.json @@ -0,0 +1,540 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + 
"aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/basic_iolets.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=basic_iolets", + "name": "basic_iolets" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,basic_iolets,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, 
\"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { 
+ "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:59:52.401211+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets&map_index=-1", + "orchestrator": "airflow", + "dag_id": "basic_iolets", + "task_id": "run_data_task" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_data_task&dag_id=basic_iolets&map_index=-1", + "name": "basic_iolets_run_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696057192401, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + 
"aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057192401, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableB', env='DEV', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableC', env='PROD', platform_instance='cloud'), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None), Dataset(platform='snowflake', name='mydb.schema.tableE', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"task_id\": \"run_data_task\"}, 
\"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'This is where you might run your data tooling.'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"env\": \"DEV\", \"name\": \"mydb.schema.tableB\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableC\", \"platform\": \"snowflake\", \"platform_instance\": \"cloud\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}, {\"env\": \"PROD\", \"name\": \"mydb.schema.tableE\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=basic_iolets&_flt_3_task_id=run_data_task", + "name": "run_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,cloud.mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableB,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + 
"json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableE,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,basic_iolets,prod),run_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:5d666eaf9015a31b3e305e8bc2dba078", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057192982, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json new file mode 100644 index 00000000000000..9acc47ec1321e5 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag.json @@ -0,0 +1,674 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "'A simple DAG that runs a few fake data tasks.'", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag", + "name": "simple_dag", + "description": "A simple DAG that runs a few fake data tasks." 
+ }
+ }
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)",
+ "changeType": "UPSERT",
+ "aspectName": "ownership",
+ "aspect": {
+ "json": {
+ "owners": [
+ {
+ "owner": "urn:li:corpuser:airflow",
+ "type": "DEVELOPER",
+ "source": {
+ "type": "SERVICE"
+ }
+ }
+ ],
+ "lastModified": {
+ "time": 0,
+ "actor": "urn:li:corpuser:airflow"
+ }
+ }
+ }
+},
+{
+ "entityType": "dataFlow",
+ "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)",
+ "changeType": "UPSERT",
+ "aspectName": "globalTags",
+ "aspect": {
+ "json": {
+ "tags": []
+ }
+ }
+},
+{
+ "entityType": "dataJob",
+ "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+ "changeType": "UPSERT",
+ "aspectName": "dataJobInfo",
+ "aspect": {
+ "json": {
+ "customProperties": {
+ "depends_on_past": "False",
+ "email": "None",
+ "label": "'task_1'",
+ "execution_timeout": "None",
+ "sla": "None",
+ "task_id": "'task_1'",
+ "trigger_rule": "",
+ "wait_for_downstream": "False",
+ "downstream_task_ids": "['run_another_data_task']",
+ "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]",
+ "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]",
+ "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}"
+ },
+ "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1",
+ "name": "task_1",
+ "type": {
+ "string": "COMMAND"
"COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:53:58.219003+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "BashOperator", + "priority_weight": "2", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag&map_index=-1", + "orchestrator": "airflow", + "dag_id": "simple_dag", + "task_id": "task_1" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag&map_index=-1", + "name": "simple_dag_task_1_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056838219, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + 
"json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056838219, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'task_1'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'task_1'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['run_another_data_task']", + "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]", + "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": 
[{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1", + "name": "task_1", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056838648, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + 
"customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:54:02.407515+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", 
+ "external_executor_id": "None", + "state": "running", + "operator": "BashOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag&map_index=-1", + "orchestrator": "airflow", + "dag_id": "simple_dag", + "task_id": "run_another_data_task" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag&map_index=-1", + "name": "simple_dag_run_another_data_task_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056842407, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056842407, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'run_another_data_task'", + "execution_timeout": "None", + "sla": "None", + "task_id": "'run_another_data_task'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_lock_for_execution\": true, \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_on_exit_code\": [99], \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"wait_for_past_depends_before_skipping\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": 
"http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056842831, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json new file mode 100644 index 00000000000000..03299c483f57fd --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_simple_dag_no_dag_listener.json @@ -0,0 +1,732 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "'A simple DAG that runs a few fake data tasks.'", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag", + "name": "simple_dag", + "description": "A simple DAG that runs a few fake data tasks." 
+    }
+  }
+},
+{
+  "entityType": "dataFlow",
+  "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)",
+  "changeType": "UPSERT",
+  "aspectName": "ownership",
+  "aspect": {
+    "json": {
+      "owners": [
+        {
+          "owner": "urn:li:corpuser:airflow",
+          "type": "DEVELOPER",
+          "source": {
+            "type": "SERVICE"
+          }
+        }
+      ],
+      "lastModified": {
+        "time": 0,
+        "actor": "urn:li:corpuser:airflow"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataFlow",
+  "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)",
+  "changeType": "UPSERT",
+  "aspectName": "globalTags",
+  "aspect": {
+    "json": {
+      "tags": []
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+  "changeType": "UPSERT",
+  "aspectName": "dataJobInfo",
+  "aspect": {
+    "json": {
+      "customProperties": {
+        "depends_on_past": "False",
+        "email": "None",
+        "label": "'task_1'",
+        "execution_timeout": "None",
+        "sla": "None",
+        "task_id": "'task_1'",
+        "trigger_rule": "",
+        "wait_for_downstream": "False",
+        "downstream_task_ids": "['run_another_data_task']",
+        "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]",
+        "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]",
+        "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}"
+      },
+      "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1",
+      "name": "task_1",
+      "type": {
+        "string": "COMMAND"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+  "changeType": "UPSERT",
+  "aspectName": "dataJobInputOutput",
+  "aspect": {
+    "json": {
+      "inputDatasets": [
+        "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)",
+        "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+      ],
+      "outputDatasets": [
+        "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)"
+      ],
+      "inputDatajobs": [],
+      "fineGrainedLineages": []
+    }
+  }
+},
+{
+  "entityType": "dataset",
+  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)",
+  "changeType": "UPSERT",
+  "aspectName": "status",
+  "aspect": {
+    "json": {
+      "removed": false
+    }
+  }
+},
+{
+  "entityType": "dataset",
+  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)",
+  "changeType": "UPSERT",
+  "aspectName": "status",
+  "aspect": {
+    "json": {
+      "removed": false
+    }
+  }
+},
+{
+  "entityType": "dataset",
+  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)",
+  "changeType": "UPSERT",
+  "aspectName": "status",
+  "aspect": {
+    "json": {
+      "removed": false
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+  "changeType": "UPSERT",
+  "aspectName": "ownership",
+  "aspect": {
+    "json": {
+      "owners": [
+        {
+          "owner": "urn:li:corpuser:airflow",
+          "type": "DEVELOPER",
+          "source": {
+            "type": "SERVICE"
+          }
+        }
+      ],
+      "lastModified": {
+        "time": 0,
+        "actor": "urn:li:corpuser:airflow"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+  "changeType": "UPSERT",
+  "aspectName": "globalTags",
+  "aspect": {
+    "json": {
+      "tags": []
+    }
+  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceProperties",
+  "aspect": {
+    "json": {
+      "customProperties": {
+        "run_id": "manual_run_test",
+        "duration": "None",
+        "start_date": "2023-09-30 06:58:56.105026+00:00",
+        "end_date": "None",
+        "execution_date": "2023-09-27 21:34:38+00:00",
+        "try_number": "0",
+        "max_tries": "0",
+        "external_executor_id": "None",
+        "state": "running",
+        "operator": "BashOperator",
+        "priority_weight": "2",
+        "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag&map_index=-1",
+        "orchestrator": "airflow",
+        "dag_id": "simple_dag",
+        "task_id": "task_1"
+      },
+      "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=task_1&dag_id=simple_dag&map_index=-1",
+      "name": "simple_dag_task_1_manual_run_test",
+      "type": "BATCH_AD_HOC",
+      "created": {
+        "time": 1696057136105,
+        "actor": "urn:li:corpuser:datahub"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceRelationships",
+  "aspect": {
+    "json": {
+      "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+      "upstreamInstances": []
+    }
+  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceInput",
+  "aspect": {
+    "json": {
+      "inputs": [
+        "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)",
+        "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+      ]
+    }
+  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceOutput",
+  "aspect": {
+    "json": {
+      "outputs": [
+        "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)"
+      ]
+    }
+  }
+},
+{
+  "entityType": "dataset",
+  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)",
+  "changeType": "UPSERT",
+  "aspectName": "status",
+  "aspect": {
+    "json": {
+      "removed": false
+    }
+  }
+},
+{
+  "entityType": "dataset",
+  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)",
+  "changeType": "UPSERT",
+  "aspectName": "status",
+  "aspect": {
+    "json": {
+      "removed": false
+    }
+  }
+},
+{
+  "entityType": "dataset",
+  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)",
+  "changeType": "UPSERT",
+  "aspectName": "status",
+  "aspect": {
+    "json": {
+      "removed": false
+    }
+  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceRunEvent",
+  "aspect": {
+    "json": {
+      "timestampMillis": 1696057136105,
+      "partitionSpec": {
+        "type": "FULL_TABLE",
+        "partition": "FULL_TABLE_SNAPSHOT"
+      },
+      "status": "STARTED",
+      "attempt": 1
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+  "changeType": "UPSERT",
+  "aspectName": "dataJobInfo",
+  "aspect": {
+    "json": {
+      "customProperties": {
+        "depends_on_past": "False",
+        "email": "None",
+        "label": "'task_1'",
+        "execution_timeout": "None",
+        "sla": "None",
+        "task_id": "'task_1'",
+        "trigger_rule": "",
+        "wait_for_downstream": "False",
+        "downstream_task_ids": "['run_another_data_task']",
+        "inlets": "[Dataset(platform='snowflake', name='mydb.schema.tableA', env='PROD', platform_instance=None), Urn(_urn='urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)')]",
+        "outlets": "[Dataset(platform='snowflake', name='mydb.schema.tableD', env='PROD', platform_instance=None)]",
+        "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 1'\", \"dag\": \"<>\", \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"task_id\": \"task_1\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 1'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [\"run_another_data_task\"], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableA\", \"platform\": \"snowflake\"}, {\"_urn\": \"urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)\"}], \"outlets\": [{\"env\": \"PROD\", \"name\": \"mydb.schema.tableD\", \"platform\": \"snowflake\"}], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"task_1\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}"
+      },
+      "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=task_1",
+      "name": "task_1",
+      "type": {
+        "string": "COMMAND"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+  "changeType": "UPSERT",
+  "aspectName": "dataJobInputOutput",
+  "aspect": {
+    "json": {
+      "inputDatasets": [
+        "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)",
+        "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)"
+      ],
+      "outputDatasets": [
+        "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)"
+      ],
+      "inputDatajobs": [],
+      "fineGrainedLineages": []
+    }
+  }
+},
+{
+  "entityType": "dataset",
+  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableA,PROD)",
+  "changeType": "UPSERT",
+  "aspectName": "status",
+  "aspect": {
+    "json": {
+      "removed": false
+    }
+  }
+},
+{
+  "entityType": "dataset",
+  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableC,PROD)",
+  "changeType": "UPSERT",
+  "aspectName": "status",
+  "aspect": {
+    "json": {
+      "removed": false
+    }
+  }
+},
+{
+  "entityType": "dataset",
+  "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,mydb.schema.tableD,PROD)",
+  "changeType": "UPSERT",
+  "aspectName": "status",
+  "aspect": {
+    "json": {
+      "removed": false
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+  "changeType": "UPSERT",
+  "aspectName": "ownership",
+  "aspect": {
+    "json": {
+      "owners": [
+        {
+          "owner": "urn:li:corpuser:airflow",
+          "type": "DEVELOPER",
+          "source": {
+            "type": "SERVICE"
+          }
+        }
+      ],
+      "lastModified": {
+        "time": 0,
+        "actor": "urn:li:corpuser:airflow"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)",
+  "changeType": "UPSERT",
+  "aspectName": "globalTags",
+  "aspect": {
+    "json": {
+      "tags": []
+    }
+  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:fdbbbcd638bc0e91bbf8d7775efbecaf",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceRunEvent",
+  "aspect": {
+    "json": {
+      "timestampMillis": 1696057136612,
+      "partitionSpec": {
+        "type": "FULL_TABLE",
+        "partition": "FULL_TABLE_SNAPSHOT"
+      },
+      "status": "COMPLETE",
+      "result": {
+        "type": "SUCCESS",
+        "nativeResultType": "airflow"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataFlow",
+  "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)",
+  "changeType": "UPSERT",
+  "aspectName": "dataFlowInfo",
+  "aspect": {
+    "json": {
+      "customProperties": {
+        "_access_control": "None",
+        "catchup": "False",
+        "description": "'A simple DAG that runs a few fake data tasks.'",
+        "doc_md": "None",
+        "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/simple_dag.py'",
+        "is_paused_upon_creation": "None",
+        "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))",
+        "tags": "[]",
+        "timezone": "Timezone('UTC')"
+      },
+      "externalUrl": "http://airflow.example.com/tree?dag_id=simple_dag",
+      "name": "simple_dag",
+      "description": "A simple DAG that runs a few fake data tasks."
+    }
+  }
+},
+{
+  "entityType": "dataFlow",
+  "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)",
+  "changeType": "UPSERT",
+  "aspectName": "ownership",
+  "aspect": {
+    "json": {
+      "owners": [
+        {
+          "owner": "urn:li:corpuser:airflow",
+          "type": "DEVELOPER",
+          "source": {
+            "type": "SERVICE"
+          }
+        }
+      ],
+      "lastModified": {
+        "time": 0,
+        "actor": "urn:li:corpuser:airflow"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataFlow",
+  "entityUrn": "urn:li:dataFlow:(airflow,simple_dag,prod)",
+  "changeType": "UPSERT",
+  "aspectName": "globalTags",
+  "aspect": {
+    "json": {
+      "tags": []
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)",
+  "changeType": "UPSERT",
+  "aspectName": "dataJobInfo",
+  "aspect": {
+    "json": {
+      "customProperties": {
+        "depends_on_past": "False",
+        "email": "None",
+        "label": "'run_another_data_task'",
+        "execution_timeout": "None",
+        "sla": "None",
+        "task_id": "'run_another_data_task'",
+        "trigger_rule": "",
+        "wait_for_downstream": "False",
+        "downstream_task_ids": "[]",
+        "inlets": "[]",
+        "outlets": "[]",
+        "openlineage_run_facet_unknownSourceAttribute": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"unknownItems\": [{\"name\": \"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}"
+      },
+      "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task",
+      "name": "run_another_data_task",
+      "type": {
+        "string": "COMMAND"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)",
+  "changeType": "UPSERT",
+  "aspectName": "dataJobInputOutput",
+  "aspect": {
+    "json": {
+      "inputDatasets": [],
+      "outputDatasets": [],
+      "inputDatajobs": [
+        "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)"
+      ],
+      "fineGrainedLineages": []
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)",
+  "changeType": "UPSERT",
+  "aspectName": "ownership",
+  "aspect": {
+    "json": {
+      "owners": [
+        {
+          "owner": "urn:li:corpuser:airflow",
+          "type": "DEVELOPER",
+          "source": {
+            "type": "SERVICE"
+          }
+        }
+      ],
+      "lastModified": {
+        "time": 0,
+        "actor": "urn:li:corpuser:airflow"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataJob",
+  "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)",
+  "changeType": "UPSERT",
+  "aspectName": "globalTags",
+  "aspect": {
+    "json": {
+      "tags": []
+    }
+  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceProperties",
+  "aspect": {
+    "json": {
+      "customProperties": {
+        "run_id": "manual_run_test",
+        "duration": "None",
+        "start_date": "2023-09-30 06:58:59.567004+00:00",
+        "end_date": "None",
+        "execution_date": "2023-09-27 21:34:38+00:00",
+        "try_number": "0",
+        "max_tries": "0",
+        "external_executor_id": "None",
+        "state": "running",
+        "operator": "BashOperator",
+        "priority_weight": "1",
+        "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag&map_index=-1",
+        "orchestrator": "airflow",
+        "dag_id": "simple_dag",
+        "task_id": "run_another_data_task"
+      },
+      "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=run_another_data_task&dag_id=simple_dag&map_index=-1",
+      "name": "simple_dag_run_another_data_task_manual_run_test",
+      "type": "BATCH_AD_HOC",
+      "created": {
+        "time": 1696057139567,
+        "actor": "urn:li:corpuser:datahub"
+      }
+    }
+  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceRelationships",
+  "aspect": {
+    "json": {
+      "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)",
+      "upstreamInstances": []
+    }
+  }
+},
+{
+  "entityType": "dataProcessInstance",
+  "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae",
+  "changeType": "UPSERT",
+  "aspectName": "dataProcessInstanceRunEvent",
+  "aspect": {
+    "json": {
+      "timestampMillis": 1696057139567,
+      "partitionSpec": {
+        "type": "FULL_TABLE",
+        "partition": "FULL_TABLE_SNAPSHOT"
+      },
+      "status": "STARTED",
+      "attempt": 1
+    }
+  }
+},
\"BashOperator\", \"properties\": {\"_BaseOperator__from_mapped\": false, \"_BaseOperator__init_kwargs\": {\"bash_command\": \"echo 'task 2'\", \"dag\": \"<>\", \"task_id\": \"run_another_data_task\"}, \"_BaseOperator__instantiated\": true, \"_dag\": \"<>\", \"_log\": \"<>\", \"append_env\": false, \"bash_command\": \"echo 'task 2'\", \"depends_on_past\": false, \"do_xcom_push\": true, \"downstream_task_ids\": [], \"email_on_failure\": true, \"email_on_retry\": true, \"executor_config\": {}, \"ignore_first_depends_on_past\": true, \"inlets\": [], \"outlets\": [], \"output_encoding\": \"utf-8\", \"owner\": \"airflow\", \"params\": \"<>\", \"pool\": \"default_pool\", \"pool_slots\": 1, \"priority_weight\": 1, \"queue\": \"default\", \"retries\": 0, \"retry_delay\": \"<>\", \"retry_exponential_backoff\": false, \"skip_exit_code\": 99, \"start_date\": \"<>\", \"task_group\": \"<>\", \"task_id\": \"run_another_data_task\", \"trigger_rule\": \"all_success\", \"upstream_task_ids\": [\"task_1\"], \"wait_for_downstream\": false, \"weight_rule\": \"downstream\"}, \"type\": \"operator\"}]}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=simple_dag&_flt_3_task_id=run_another_data_task", + "name": "run_another_data_task", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),task_1)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,simple_dag,prod),run_another_data_task)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:888f71b79d9a0b162fe44acad7b2c2ae", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696057140164, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json new file mode 100644 index 00000000000000..11a0b17b45b95c --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_snowflake_operator.json @@ -0,0 +1,512 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,snowflake_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": 
"None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/snowflake_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=snowflake_operator", + "name": "snowflake_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,snowflake_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,snowflake_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE OR REPLACE TABLE processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE OR REPLACE TABLE processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=snowflake_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),month)" + ], + "downstreamType": 
"FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-09-30 06:55:36.844976+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SnowflakeOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=snowflake_operator&map_index=-1", + 
"orchestrator": "airflow", + "dag_id": "snowflake_operator", + "task_id": "transform_cost_table" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=snowflake_operator&map_index=-1", + "name": "snowflake_operator_transform_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1696056936844, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056936844, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE OR REPLACE TABLE processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE OR REPLACE TABLE processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / 
area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=snowflake_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,datahub_test_database.datahub_test_schema.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + 
} + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,snowflake_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:3161034cc84e16a7c5e1906225734747", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1696056938096, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "FAILURE", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json new file mode 100644 index 00000000000000..19e4aac9fb95e1 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator.json @@ -0,0 +1,1858 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'create_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n '", + "task_id": "'create_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['populate_cost_table']", + "inlets": "[]", + 
"outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", + "name": "create_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:29:10.262813+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "5", + "log_url": 
"http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=create_cost_table&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "create_cost_table" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=create_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_create_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1697401750262, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401750262, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'create_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n '", + "task_id": "'create_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['populate_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", + "name": "create_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [ + 
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401750651, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + 
"depends_on_past": "False", + "email": "None", + "label": "'populate_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "\"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"", + "task_id": "'populate_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['transform_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=populate_cost_table", + "name": "populate_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:29:15.013834+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "4", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=populate_cost_table&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "populate_cost_table" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=populate_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": 
"sqlite_operator_populate_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1697401755013, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401755013, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'populate_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "\"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"", + "task_id": "'populate_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['transform_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=populate_cost_table", + "name": "populate_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + 
"entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401755600, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['cleanup_costs', 'cleanup_processed_costs']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], 
+ "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:29:20.216818+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "3", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "transform_cost_table" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_transform_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + 
"time": 1697401760216, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401760216, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['cleanup_costs', 'cleanup_processed_costs']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + 
"inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401761237, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE costs\\n '", + "task_id": "'cleanup_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like 
inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_costs", + "name": "cleanup_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:29:26.243934+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_costs&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "cleanup_costs" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_costs&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_cleanup_costs_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1697401766243, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": 
"dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401766243, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE costs\\n '", + "task_id": "'cleanup_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_costs", + "name": "cleanup_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], 
+ "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401767373, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_processed_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE processed_costs\\n '", + "task_id": "'cleanup_processed_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE processed_costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_processed_costs", + "name": "cleanup_processed_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:29:32.075613+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_processed_costs&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "cleanup_processed_costs" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_processed_costs&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_cleanup_processed_costs_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1697401772075, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401772075, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_processed_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE 
processed_costs\\n '", + "task_id": "'cleanup_processed_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE processed_costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_processed_costs", + "name": "cleanup_processed_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401773454, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git 
a/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json new file mode 100644 index 00000000000000..b67464b385335c --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/goldens/v2_sqlite_operator_no_dag_listener.json @@ -0,0 +1,2086 @@ +[ +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'create_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n '", + "task_id": "'create_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['populate_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", + "name": "create_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": 
"FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:27:26.883178+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "5", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=create_cost_table&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "create_cost_table" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=create_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_create_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1697401646883, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + 
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401646883, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'create_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n '", + "task_id": "'create_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['populate_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS costs (\\n id INTEGER PRIMARY KEY,\\n month TEXT NOT NULL,\\n total_cost REAL NOT NULL,\\n area REAL NOT NULL\\n )\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=create_cost_table", + "name": "create_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "inputDatajobs": [], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + 
"confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:fbeed1180fa0434e02ac6f75ace87869", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401647826, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": 
"dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'populate_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "\"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"", + "task_id": "'populate_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['transform_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=populate_cost_table", + "name": "populate_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:27:31.398799+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "4", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=populate_cost_table&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "populate_cost_table" + }, + "externalUrl": 
"http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=populate_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_populate_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1697401651398, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401651398, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'populate_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "\"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"", + "task_id": "'populate_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['transform_cost_table']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n INSERT INTO costs (id, month, total_cost, area)\\n VALUES\\n (1, '2021-01', 100, 10),\\n (2, '2021-02', 200, 20),\\n (3, '2021-03', 300, 30)\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=populate_cost_table", + "name": "populate_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),create_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:04e1badac1eacd1c41123d07f579fa92", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401652651, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['cleanup_costs', 'cleanup_processed_costs']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": 
\"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + 
{ + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:27:37.697995+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "3", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "transform_cost_table" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=transform_cost_table&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_transform_cost_table_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1697401657697, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceOutput", + "aspect": { + "json": { + "outputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401657697, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'transform_cost_table'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n '", + "task_id": "'transform_cost_table'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "['cleanup_costs', 'cleanup_processed_costs']", + "inlets": "[]", + "outlets": "[]", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n CREATE TABLE IF NOT EXISTS processed_costs AS\\n SELECT\\n id,\\n month,\\n total_cost,\\n area,\\n total_cost / area as cost_per_area\\n FROM costs\\n \"}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=transform_cost_table", + "name": "transform_cost_table", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),populate_cost_table)" + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + 
"downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),month)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),month)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),total_cost)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),area)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),area)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD),total_cost)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD),cost_per_area)" + ], + "confidenceScore": 1.0 + } + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:64e5ff8f552e857b607832731e09808b", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401659496, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": 
"dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE costs\\n '", + "task_id": "'cleanup_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_costs", + "name": "cleanup_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + 
"inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:27:45.670215+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_costs&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "cleanup_costs" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_costs&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_cleanup_costs_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1697401665670, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401665670, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE costs\\n '", + "task_id": "'cleanup_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_costs", + "name": "cleanup_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:07285de22276959612189d51336cc21a", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401667670, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": 
"FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "dataFlowInfo", + "aspect": { + "json": { + "customProperties": { + "_access_control": "None", + "catchup": "False", + "description": "None", + "doc_md": "None", + "fileloc": "'/Users/hsheth/projects/datahub/metadata-ingestion-modules/airflow-plugin/tests/integration/dags/sqlite_operator.py'", + "is_paused_upon_creation": "None", + "start_date": "DateTime(2023, 1, 1, 0, 0, 0, tzinfo=Timezone('UTC'))", + "tags": "[]", + "timezone": "Timezone('UTC')" + }, + "externalUrl": "http://airflow.example.com/tree?dag_id=sqlite_operator", + "name": "sqlite_operator" + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(airflow,sqlite_operator,prod)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_processed_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE processed_costs\\n '", + "task_id": "'cleanup_processed_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE processed_costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_processed_costs", + "name": "cleanup_processed_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceProperties", + "aspect": { + "json": { + "customProperties": { + "run_id": "manual_run_test", + "duration": "None", + "start_date": "2023-10-15 20:27:51.559194+00:00", + "end_date": "None", + "execution_date": "2023-09-27 21:34:38+00:00", + "try_number": "0", + "max_tries": "0", + "external_executor_id": "None", + "state": "running", + "operator": "SqliteOperator", + "priority_weight": "1", + "log_url": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_processed_costs&dag_id=sqlite_operator&map_index=-1", + "orchestrator": "airflow", + "dag_id": "sqlite_operator", + "task_id": "cleanup_processed_costs" + }, + "externalUrl": "http://airflow.example.com/log?execution_date=2023-09-27T21%3A34%3A38%2B00%3A00&task_id=cleanup_processed_costs&dag_id=sqlite_operator&map_index=-1", + "name": "sqlite_operator_cleanup_processed_costs_manual_run_test", + "type": "BATCH_AD_HOC", + "created": { + "time": 1697401671559, + "actor": "urn:li:corpuser:datahub" + } + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRelationships", + "aspect": { + "json": { + "parentTemplate": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "upstreamInstances": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceInput", + "aspect": { + "json": { + "inputs": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": 
"urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401671559, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "STARTED", + "attempt": 1 + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInfo", + "aspect": { + "json": { + "customProperties": { + "depends_on_past": "False", + "email": "None", + "label": "'cleanup_processed_costs'", + "execution_timeout": "None", + "sla": "None", + "sql": "'\\n DROP TABLE processed_costs\\n '", + "task_id": "'cleanup_processed_costs'", + "trigger_rule": "", + "wait_for_downstream": "False", + "downstream_task_ids": "[]", + "inlets": "[]", + "outlets": "[]", + "datahub_sql_parser_error": "Can only generate column-level lineage for select-like inner statements, not (outer statement type: )", + "openlineage_job_facet_sql": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/SqlJobFacet\", \"query\": \"\\n DROP TABLE processed_costs\\n \"}", + "openlineage_run_facet_extractionError": "{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/ExtractionErrorRunFacet\", \"errors\": [{\"_producer\": \"https://github.com/OpenLineage/OpenLineage/tree/1.2.0/integration/airflow\", \"_schemaURL\": \"https://raw.githubusercontent.com/OpenLineage/OpenLineage/main/spec/OpenLineage.json#/definitions/BaseFacet\", \"errorMessage\": \"Can only generate column-level lineage for select-like inner statements, not (outer statement type: )\", \"task\": \"datahub_sql_parser\"}], \"failedTasks\": 1, \"totalTasks\": 1}" + }, + "externalUrl": "http://airflow.example.com/taskinstance/list/?flt1_dag_id_equals=sqlite_operator&_flt_3_task_id=cleanup_processed_costs", + "name": "cleanup_processed_costs", + "type": { + "string": "COMMAND" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "dataJobInputOutput", + "aspect": { + "json": { + "inputDatasets": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)" + ], + "outputDatasets": [], + "inputDatajobs": [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),transform_cost_table)" + ], + "fineGrainedLineages": [] + } + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:sqlite,public.processed_costs,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:airflow", + "type": "DEVELOPER", + "source": { + "type": "SERVICE" + } + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:airflow" + } + } + } +}, +{ + "entityType": "dataJob", + "entityUrn": 
"urn:li:dataJob:(urn:li:dataFlow:(airflow,sqlite_operator,prod),cleanup_processed_costs)", + "changeType": "UPSERT", + "aspectName": "globalTags", + "aspect": { + "json": { + "tags": [] + } + } +}, +{ + "entityType": "dataProcessInstance", + "entityUrn": "urn:li:dataProcessInstance:bab908abccf3cd6607b50fdaf3003372", + "changeType": "UPSERT", + "aspectName": "dataProcessInstanceRunEvent", + "aspect": { + "json": { + "timestampMillis": 1697401673788, + "partitionSpec": { + "type": "FULL_TABLE", + "partition": "FULL_TABLE_SNAPSHOT" + }, + "status": "COMPLETE", + "result": { + "type": "SUCCESS", + "nativeResultType": "airflow" + } + } + } +} +] \ No newline at end of file diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py deleted file mode 100644 index 10cf3ad0a608ae..00000000000000 --- a/metadata-ingestion-modules/airflow-plugin/tests/integration/integration_test_dummy.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_dummy(): - pass diff --git a/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py b/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py new file mode 100644 index 00000000000000..a2b7fd151a1e41 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/integration/test_plugin.py @@ -0,0 +1,392 @@ +import contextlib +import dataclasses +import functools +import logging +import os +import pathlib +import random +import signal +import subprocess +import time +from typing import Iterator, Sequence + +import pytest +import requests +import tenacity +from airflow.models.connection import Connection +from datahub.testing.compare_metadata_json import assert_metadata_files_equal + +from datahub_airflow_plugin._airflow_shims import ( + HAS_AIRFLOW_DAG_LISTENER_API, + HAS_AIRFLOW_LISTENER_API, + HAS_AIRFLOW_STANDALONE_CMD, +) + +pytestmark = pytest.mark.integration + +logger = logging.getLogger(__name__) +IS_LOCAL = os.environ.get("CI", "false") == "false" + +DAGS_FOLDER = pathlib.Path(__file__).parent / "dags" +GOLDENS_FOLDER = pathlib.Path(__file__).parent / "goldens" + + +@dataclasses.dataclass +class AirflowInstance: + airflow_home: pathlib.Path + airflow_port: int + pid: int + env_vars: dict + + username: str + password: str + + metadata_file: pathlib.Path + + @property + def airflow_url(self) -> str: + return f"http://localhost:{self.airflow_port}" + + @functools.cached_property + def session(self) -> requests.Session: + session = requests.Session() + session.auth = (self.username, self.password) + return session + + +@tenacity.retry( + reraise=True, + wait=tenacity.wait_fixed(1), + stop=tenacity.stop_after_delay(60), + retry=tenacity.retry_if_exception_type( + (AssertionError, requests.exceptions.RequestException) + ), +) +def _wait_for_airflow_healthy(airflow_port: int) -> None: + print("Checking if Airflow is ready...") + res = requests.get(f"http://localhost:{airflow_port}/health", timeout=5) + res.raise_for_status() + + airflow_health = res.json() + assert airflow_health["metadatabase"]["status"] == "healthy" + assert airflow_health["scheduler"]["status"] == "healthy" + + +class NotReadyError(Exception): + pass + + +@tenacity.retry( + reraise=True, + wait=tenacity.wait_fixed(1), + stop=tenacity.stop_after_delay(90), + retry=tenacity.retry_if_exception_type(NotReadyError), +) +def _wait_for_dag_finish( + airflow_instance: AirflowInstance, dag_id: str, require_success: bool +) -> None: + 
print("Checking if DAG is finished") + res = airflow_instance.session.get( + f"{airflow_instance.airflow_url}/api/v1/dags/{dag_id}/dagRuns", timeout=5 + ) + res.raise_for_status() + + dag_runs = res.json()["dag_runs"] + if not dag_runs: + raise NotReadyError("No DAG runs found") + + dag_run = dag_runs[0] + if dag_run["state"] == "failed": + if require_success: + raise ValueError("DAG failed") + # else - success is not required, so we're done. + + elif dag_run["state"] != "success": + raise NotReadyError(f"DAG has not finished yet: {dag_run['state']}") + + +@contextlib.contextmanager +def _run_airflow( + tmp_path: pathlib.Path, dags_folder: pathlib.Path, is_v1: bool +) -> Iterator[AirflowInstance]: + airflow_home = tmp_path / "airflow_home" + print(f"Using airflow home: {airflow_home}") + + if IS_LOCAL: + airflow_port = 11792 + else: + airflow_port = random.randint(10000, 12000) + print(f"Using airflow port: {airflow_port}") + + datahub_connection_name = "datahub_file_default" + meta_file = tmp_path / "datahub_metadata.json" + + environment = { + **os.environ, + "AIRFLOW_HOME": str(airflow_home), + "AIRFLOW__WEBSERVER__WEB_SERVER_PORT": str(airflow_port), + "AIRFLOW__WEBSERVER__BASE_URL": "http://airflow.example.com", + # Point airflow to the DAGs folder. + "AIRFLOW__CORE__LOAD_EXAMPLES": "False", + "AIRFLOW__CORE__DAGS_FOLDER": str(dags_folder), + "AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION": "False", + # Have the Airflow API use username/password authentication. + "AIRFLOW__API__AUTH_BACKEND": "airflow.api.auth.backend.basic_auth", + # Configure the datahub plugin and have it write the MCPs to a file. + "AIRFLOW__CORE__LAZY_LOAD_PLUGINS": "False" if is_v1 else "True", + "AIRFLOW__DATAHUB__CONN_ID": datahub_connection_name, + f"AIRFLOW_CONN_{datahub_connection_name.upper()}": Connection( + conn_id="datahub_file_default", + conn_type="datahub-file", + host=str(meta_file), + ).get_uri(), + # Configure fake credentials for the Snowflake connection. + "AIRFLOW_CONN_MY_SNOWFLAKE": Connection( + conn_id="my_snowflake", + conn_type="snowflake", + login="fake_username", + password="fake_password", + schema="DATAHUB_TEST_SCHEMA", + extra={ + "account": "fake_account", + "database": "DATAHUB_TEST_DATABASE", + "warehouse": "fake_warehouse", + "role": "fake_role", + "insecure_mode": "true", + }, + ).get_uri(), + "AIRFLOW_CONN_MY_SQLITE": Connection( + conn_id="my_sqlite", + conn_type="sqlite", + host=str(tmp_path / "my_sqlite.db"), + ).get_uri(), + # Convenience settings. + "AIRFLOW__DATAHUB__LOG_LEVEL": "DEBUG", + "AIRFLOW__DATAHUB__DEBUG_EMITTER": "True", + "SQLALCHEMY_SILENCE_UBER_WARNING": "1", + } + + if not HAS_AIRFLOW_STANDALONE_CMD: + raise pytest.skip("Airflow standalone command is not available") + + # Start airflow in a background subprocess. + airflow_process = subprocess.Popen( + ["airflow", "standalone"], + env=environment, + ) + + try: + _wait_for_airflow_healthy(airflow_port) + print("Airflow is ready!") + + # Sleep for a few seconds to make sure the other Airflow processes are ready. + time.sleep(3) + + # Create an extra "airflow" user for easy testing. + if IS_LOCAL: + print("Creating an extra test user...") + subprocess.check_call( + [ + # fmt: off + "airflow", "users", "create", + "--username", "airflow", + "--password", "airflow", + "--firstname", "admin", + "--lastname", "admin", + "--role", "Admin", + "--email", "airflow@example.com", + # fmt: on + ], + env=environment, + ) + + # Sanity check that the plugin got loaded. 
+ if not is_v1: + print("[debug] Listing loaded plugins") + subprocess.check_call( + ["airflow", "plugins", "-v"], + env=environment, + ) + + # Load the admin user's password. This is generated by the + # `airflow standalone` command, and is different from the + # airflow user that we create when running locally. + airflow_username = "admin" + airflow_password = (airflow_home / "standalone_admin_password.txt").read_text() + + airflow_instance = AirflowInstance( + airflow_home=airflow_home, + airflow_port=airflow_port, + pid=airflow_process.pid, + env_vars=environment, + username=airflow_username, + password=airflow_password, + metadata_file=meta_file, + ) + + yield airflow_instance + finally: + try: + # Attempt a graceful shutdown. + print("Shutting down airflow...") + airflow_process.send_signal(signal.SIGINT) + airflow_process.wait(timeout=30) + except subprocess.TimeoutExpired: + # If the graceful shutdown failed, kill the process. + print("Hard shutting down airflow...") + airflow_process.kill() + airflow_process.wait(timeout=3) + + +def check_golden_file( + pytestconfig: pytest.Config, + output_path: pathlib.Path, + golden_path: pathlib.Path, + ignore_paths: Sequence[str] = (), +) -> None: + update_golden = pytestconfig.getoption("--update-golden-files") + + assert_metadata_files_equal( + output_path=output_path, + golden_path=golden_path, + update_golden=update_golden, + copy_output=False, + ignore_paths=ignore_paths, + ignore_order=False, + ) + + +@dataclasses.dataclass +class DagTestCase: + dag_id: str + success: bool = True + + v2_only: bool = False + + +test_cases = [ + DagTestCase("simple_dag"), + DagTestCase("basic_iolets"), + DagTestCase("snowflake_operator", success=False, v2_only=True), + DagTestCase("sqlite_operator", v2_only=True), +] + + +@pytest.mark.parametrize( + ["golden_filename", "test_case", "is_v1"], + [ + # On Airflow <= 2.2, test plugin v1. + *[ + pytest.param( + f"v1_{test_case.dag_id}", + test_case, + True, + id=f"v1_{test_case.dag_id}", + marks=pytest.mark.skipif( + HAS_AIRFLOW_LISTENER_API, + reason="Not testing plugin v1 on newer Airflow versions", + ), + ) + for test_case in test_cases + if not test_case.v2_only + ], + *[ + pytest.param( + # On Airflow 2.3-2.4, test plugin v2 without dataFlows. + f"v2_{test_case.dag_id}" + if HAS_AIRFLOW_DAG_LISTENER_API + else f"v2_{test_case.dag_id}_no_dag_listener", + test_case, + False, + id=f"v2_{test_case.dag_id}" + if HAS_AIRFLOW_DAG_LISTENER_API + else f"v2_{test_case.dag_id}_no_dag_listener", + marks=pytest.mark.skipif( + not HAS_AIRFLOW_LISTENER_API, + reason="Cannot test plugin v2 without the Airflow plugin listener API", + ), + ) + for test_case in test_cases + ], + ], +) +def test_airflow_plugin( + pytestconfig: pytest.Config, + tmp_path: pathlib.Path, + golden_filename: str, + test_case: DagTestCase, + is_v1: bool, +) -> None: + # This test: + # - Configures the plugin. + # - Starts a local airflow instance in a subprocess. + # - Runs a DAG that uses an operator supported by the extractor. + # - Waits for the DAG to complete. + # - Validates the metadata generated against a golden file. + + if not is_v1 and not test_case.success and not HAS_AIRFLOW_DAG_LISTENER_API: + # Saw a number of issues in CI where this would fail to emit the last events + # due to an error in the SQLAlchemy listener. This never happened locally for me. 
+ pytest.skip("Cannot test failure cases without the Airflow DAG listener API") + + golden_path = GOLDENS_FOLDER / f"{golden_filename}.json" + dag_id = test_case.dag_id + + with _run_airflow( + tmp_path, dags_folder=DAGS_FOLDER, is_v1=is_v1 + ) as airflow_instance: + print(f"Running DAG {dag_id}...") + subprocess.check_call( + [ + "airflow", + "dags", + "trigger", + "--exec-date", + "2023-09-27T21:34:38+00:00", + "-r", + "manual_run_test", + dag_id, + ], + env=airflow_instance.env_vars, + ) + + print("Waiting for DAG to finish...") + _wait_for_dag_finish( + airflow_instance, dag_id, require_success=test_case.success + ) + + print("Sleeping for a few seconds to let the plugin finish...") + time.sleep(10) + + check_golden_file( + pytestconfig=pytestconfig, + output_path=airflow_instance.metadata_file, + golden_path=golden_path, + ignore_paths=[ + # Timing-related items. + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['start_date'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['end_date'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['duration'\]", + # Host-specific items. + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['pid'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['hostname'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['unixname'\]", + # TODO: If we switched to Git urls, maybe we could get this to work consistently. + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['fileloc'\]", + r"root\[\d+\]\['aspect'\]\['json'\]\['customProperties'\]\['openlineage_.*'\]", + ], + ) + + +if __name__ == "__main__": + # When run directly, just set up a local airflow instance. + import tempfile + + with _run_airflow( + tmp_path=pathlib.Path(tempfile.mkdtemp("airflow-plugin-test")), + dags_folder=DAGS_FOLDER, + is_v1=not HAS_AIRFLOW_LISTENER_API, + ) as airflow_instance: + # input("Press enter to exit...") + breakpoint() + print("quitting airflow") diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py index 9aa901171cfa65..d8620e74d7e305 100644 --- a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_airflow.py @@ -14,18 +14,21 @@ import pytest from airflow.lineage import apply_lineage, prepare_lineage from airflow.models import DAG, Connection, DagBag, DagRun, TaskInstance -from datahub_provider import get_provider_info -from datahub_provider._airflow_shims import AIRFLOW_PATCHED, EmptyOperator -from datahub_provider.entities import Dataset, Urn -from datahub_provider.hooks.datahub import DatahubKafkaHook, DatahubRestHook -from datahub_provider.operators.datahub import DatahubEmitterOperator + +from datahub_airflow_plugin import get_provider_info +from datahub_airflow_plugin._airflow_shims import ( + AIRFLOW_PATCHED, + AIRFLOW_VERSION, + EmptyOperator, +) +from datahub_airflow_plugin.entities import Dataset, Urn +from datahub_airflow_plugin.hooks.datahub import DatahubKafkaHook, DatahubRestHook +from datahub_airflow_plugin.operators.datahub import DatahubEmitterOperator assert AIRFLOW_PATCHED # TODO: Remove default_view="tree" arg. Figure out why is default_view being picked as "grid" and how to fix it ? -# Approach suggested by https://stackoverflow.com/a/11887885/5004662. 
-AIRFLOW_VERSION = packaging.version.parse(airflow.version.version) lineage_mce = builder.make_lineage_mce( [ @@ -105,7 +108,7 @@ def test_datahub_rest_hook(mock_emitter): mock_emitter.assert_called_once_with(config.host, None, None) instance = mock_emitter.return_value - instance.emit_mce.assert_called_with(lineage_mce) + instance.emit.assert_called_with(lineage_mce) @mock.patch("datahub.emitter.rest_emitter.DatahubRestEmitter", autospec=True) @@ -119,7 +122,7 @@ def test_datahub_rest_hook_with_timeout(mock_emitter): mock_emitter.assert_called_once_with(config.host, None, 5) instance = mock_emitter.return_value - instance.emit_mce.assert_called_with(lineage_mce) + instance.emit.assert_called_with(lineage_mce) @mock.patch("datahub.emitter.kafka_emitter.DatahubKafkaEmitter", autospec=True) @@ -131,11 +134,11 @@ def test_datahub_kafka_hook(mock_emitter): mock_emitter.assert_called_once() instance = mock_emitter.return_value - instance.emit_mce_async.assert_called() + instance.emit.assert_called() instance.flush.assert_called_once() -@mock.patch("datahub_provider.hooks.datahub.DatahubRestHook.emit_mces") +@mock.patch("datahub_provider.hooks.datahub.DatahubRestHook.emit") def test_datahub_lineage_operator(mock_emit): with patch_airflow_connection(datahub_rest_connection_config) as config: assert config.conn_id diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py deleted file mode 100644 index 10cf3ad0a608ae..00000000000000 --- a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_dummy.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_dummy(): - pass diff --git a/metadata-ingestion-modules/airflow-plugin/tests/unit/test_packaging.py b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_packaging.py new file mode 100644 index 00000000000000..1d0ce5835f9582 --- /dev/null +++ b/metadata-ingestion-modules/airflow-plugin/tests/unit/test_packaging.py @@ -0,0 +1,8 @@ +import setuptools + + +def test_package_list_match_inits(): + where = "./src" + package_list = set(setuptools.find_packages(where)) + namespace_packages = set(setuptools.find_namespace_packages(where)) + assert package_list == namespace_packages, "are you missing a package init file?" diff --git a/metadata-ingestion-modules/airflow-plugin/tox.ini b/metadata-ingestion-modules/airflow-plugin/tox.ini index 6a1c06aed8cdd5..2f05854940d104 100644 --- a/metadata-ingestion-modules/airflow-plugin/tox.ini +++ b/metadata-ingestion-modules/airflow-plugin/tox.ini @@ -4,32 +4,23 @@ # and then run "tox" from this directory. [tox] -envlist = py3-quick,py3-full - -[gh-actions] -python = - 3.6: py3-full - 3.9: py3-full - -# Providing optional features that add dependencies from setup.py as deps here -# allows tox to recreate testenv when new dependencies are added to setup.py. -# Previous approach of using the tox global setting extras is not recommended -# as extras is only called when the testenv is created for the first time! 
-# see more here -> https://github.com/tox-dev/tox/issues/1105#issuecomment-448596282 +envlist = py38-airflow21, py38-airflow22, py310-airflow24, py310-airflow26, py310-airflow27 [testenv] -deps = - -e ../../metadata-ingestion/[.dev] +use_develop = true +extras = dev,integration-tests,plugin-v1 +deps = + -e ../../metadata-ingestion/ + # Airflow version + airflow21: apache-airflow~=2.1.0 + airflow22: apache-airflow~=2.2.0 + airflow24: apache-airflow~=2.4.0 + airflow26: apache-airflow~=2.6.0 + airflow27: apache-airflow~=2.7.0 commands = - pytest --cov={envsitepackagesdir}/datahub --cov={envsitepackagesdir}/datahub_provider \ - py3-quick: -m 'not integration and not slow_integration' --junit-xml=junit.quick.xml \ - py3-full: --cov-fail-under 65 --junit-xml=junit.full.xml \ - --continue-on-collection-errors \ - -vv + pytest --cov-append {posargs} -setenv = - AIRFLOW_HOME = /tmp/airflow/thisshouldnotexist-{envname} +# For Airflow 2.4+, add the plugin-v2 extra. +[testenv:py310-airflow{24,26,27}] +extras = dev,integration-tests,plugin-v2 -[testenv:py3-full] -deps = - ../../metadata-ingestion/.[dev] diff --git a/metadata-ingestion/adding-source.md b/metadata-ingestion/adding-source.md index e4fc950a7cdbd0..a0930102c6827c 100644 --- a/metadata-ingestion/adding-source.md +++ b/metadata-ingestion/adding-source.md @@ -62,7 +62,7 @@ Some sources use the default `SourceReport` class, but others inherit and extend ### 3. Implement the source itself -The core for the source is the `get_workunits` method, which produces a stream of metadata events (typically MCP objects) wrapped up in a MetadataWorkUnit. +The core for the source is the `get_workunits_internal` method, which produces a stream of metadata events (typically MCP objects) wrapped up in a MetadataWorkUnit. The [file source](./src/datahub/ingestion/source/file.py) is a good and simple example. The MetadataChangeEventClass is defined in the metadata models which are generated diff --git a/metadata-ingestion/docs/dev_guides/profiling_ingestions.md b/metadata-ingestion/docs/dev_guides/profiling_ingestions.md new file mode 100644 index 00000000000000..77cc2f456aa2da --- /dev/null +++ b/metadata-ingestion/docs/dev_guides/profiling_ingestions.md @@ -0,0 +1,94 @@ +import FeatureAvailability from '@site/src/components/FeatureAvailability'; +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; + +# Profiling ingestions + + + +**🤝 Version compatibility** +> Open Source DataHub: **0.11.1** | Acryl: **0.2.12** + +This page documents how to perform memory profiles of ingestion runs. +It is useful when trying to size the amount of resources necessary to ingest some source or when developing new features or sources. + +## How to use + + + + +Create an ingestion as specified in the [Ingestion guide](../../../docs/ui-ingestion.md). + +Add a flag to your ingestion recipe to generate a memray memory dump of your ingestion: +```yaml +source: + ... + +sink: + ... + +flags: + generate_memory_profiles: "" +``` + +In the final panel, under the advanced section, add the `debug` datahub package under the **Extra DataHub Plugins** section. +As seen below: + +
+<!-- screenshot: the "Extra DataHub Plugins" field under the Advanced section of the ingestion UI -->
+ +Finally, save and run the ingestion process. + +
+ +Install the `debug` plugin for DataHub's CLI wherever the ingestion runs: + +```bash +pip install 'acryl-datahub[debug]' +``` + +This will install [memray](https://github.com/bloomberg/memray) in your python environment. + +Add a flag to your ingestion recipe to generate a memray memory dump of your ingestion: +```yaml +source: + ... + +sink: + ... + +flags: + generate_memory_profiles: "" +``` + +Finally run the ingestion recipe + +```bash +$ datahub ingest -c recipe.yaml +``` + + +
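+
+If you prefer to trigger the run from Python rather than the CLI, the same recipe, including the `flags` section shown above, can be passed to the ingestion `Pipeline` API. A minimal sketch, assuming the `debug` plugin is installed; the source/sink configs and the `/tmp/memray` dump folder are placeholders, not part of the original example:
+
+```python
+from datahub.ingestion.run.pipeline import Pipeline
+
+# This dict mirrors the YAML recipe above; swap in your own source and sink.
+pipeline = Pipeline.create(
+    {
+        "source": {"type": "file", "config": {"path": "./mces.json"}},  # placeholder source
+        "sink": {"type": "console"},  # placeholder sink
+        "flags": {"generate_memory_profiles": "/tmp/memray"},  # hypothetical dump folder
+    }
+)
+pipeline.run()
+pipeline.raise_from_status()
+```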
+
+
+Once the ingestion run starts, a binary file will be created and appended to while the ingestion executes.
+
+These files follow the pattern `file-.bin`, so each run can be identified uniquely.
+Once the ingestion has finished, you can use `memray` to analyze the memory dump in a flamegraph view:
+
+```bash
+$ memray flamegraph file-None-file-2023_09_18-21_38_43.bin
+```
+
+This will generate an interactive HTML file for analysis:
+
+<!-- screenshot: interactive memray flamegraph report for the ingestion run -->
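+
+The capture can also be driven directly from Python with `memray`'s `Tracker` API, which is convenient when you only want to profile a single function rather than a whole ingestion run. A minimal sketch; the function below is just a stand-in for the code you care about:
+
+```python
+import memray
+
+
+def work() -> int:
+    # Stand-in for the code you actually want to profile.
+    return sum(x * x for x in range(1_000_000))
+
+
+# Allocation data is written to profile.bin; analyze it afterwards with
+# `memray flamegraph profile.bin`, as shown above.
+with memray.Tracker("profile.bin"):
+    work()
+```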
+
+
+`memray` has an extensive set of features for memory investigation. Take a look at their [documentation](https://bloomberg.github.io/memray/overview.html) to see the full feature set.
+
+
+## Questions
+
+If you've got any questions on configuring profiling, feel free to ping us on [our Slack](https://slack.datahubproject.io/)!
diff --git a/metadata-ingestion/docs/sources/athena/athena_pre.md b/metadata-ingestion/docs/sources/athena/athena_pre.md
new file mode 100644
index 00000000000000..a56457d3f84fcf
--- /dev/null
+++ b/metadata-ingestion/docs/sources/athena/athena_pre.md
@@ -0,0 +1,72 @@
+### Prerequisites
+
+In order to execute this source, you will need to create a policy with the permissions below and attach it to the AWS role or credentials used in the ingestion recipe.
+
+```json
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "VisualEditor0",
+            "Effect": "Allow",
+            "Action": [
+                "athena:GetTableMetadata",
+                "athena:StartQueryExecution",
+                "athena:GetQueryResults",
+                "athena:GetDatabase",
+                "athena:ListDataCatalogs",
+                "athena:GetDataCatalog",
+                "athena:ListQueryExecutions",
+                "athena:GetWorkGroup",
+                "athena:StopQueryExecution",
+                "athena:GetQueryResultsStream",
+                "athena:ListDatabases",
+                "athena:GetQueryExecution",
+                "athena:ListTableMetadata",
+                "athena:BatchGetQueryExecution",
+                "glue:GetTables",
+                "glue:GetDatabases",
+                "glue:GetTable",
+                "glue:GetDatabase",
+                "glue:SearchTables",
+                "glue:GetTableVersions",
+                "glue:GetTableVersion",
+                "glue:GetPartition",
+                "glue:GetPartitions",
+                "s3:GetObject",
+                "s3:ListBucket",
+                "s3:GetBucketLocation"
+            ],
+            "Resource": [
+                "arn:aws:athena:${region-id}:${account-id}:datacatalog/*",
+                "arn:aws:athena:${region-id}:${account-id}:workgroup/*",
+                "arn:aws:glue:${region-id}:${account-id}:tableVersion/*/*/*",
+                "arn:aws:glue:${region-id}:${account-id}:table/*/*",
+                "arn:aws:glue:${region-id}:${account-id}:catalog",
+                "arn:aws:glue:${region-id}:${account-id}:database/*",
+                "arn:aws:s3:::${datasets-bucket}",
+                "arn:aws:s3:::${datasets-bucket}/*"
+            ]
+        },
+        {
+            "Sid": "VisualEditor1",
+            "Effect": "Allow",
+            "Action": [
+                "s3:PutObject",
+                "s3:GetObject",
+                "s3:ListBucketMultipartUploads",
+                "s3:AbortMultipartUpload",
+                "s3:ListBucket",
+                "s3:GetBucketLocation",
+                "s3:ListMultipartUploadParts"
+            ],
+            "Resource": [
+                "arn:aws:s3:::${athena-query-result-bucket}/*",
+                "arn:aws:s3:::${athena-query-result-bucket}"
+            ]
+        }
+    ]
+}
+```
+
+Replace each `${var}` with the appropriate value for your Athena setup.
\ No newline at end of file
diff --git a/metadata-ingestion/docs/sources/databricks/unity-catalog_pre.md b/metadata-ingestion/docs/sources/databricks/unity-catalog_pre.md
index 2be8846b87bea2..ae2883343d7e8f 100644
--- a/metadata-ingestion/docs/sources/databricks/unity-catalog_pre.md
+++ b/metadata-ingestion/docs/sources/databricks/unity-catalog_pre.md
@@ -13,6 +13,7 @@
 * Ownership of or `SELECT` privilege on any tables and views you want to ingest
   * [Ownership documentation](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/ownership.html)
   * [Privileges documentation](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html)
++ To ingest your workspace's notebooks and respective lineage, your service principal must have `CAN_READ` privileges on the folders containing the notebooks you want to ingest: [guide](https://docs.databricks.com/en/security/auth-authz/access-control/workspace-acl.html#folder-permissions).
+ To `include_usage_statistics` (enabled by default), your service principal must have `CAN_MANAGE` permissions on any SQL Warehouses you want to ingest: [guide](https://docs.databricks.com/security/auth-authz/access-control/sql-endpoint-acl.html). + To ingest `profiling` information with `call_analyze` (enabled by default), your service principal must have ownership or `MODIFY` privilege on any tables you want to profile. * Alternatively, you can run [ANALYZE TABLE](https://docs.databricks.com/sql/language-manual/sql-ref-syntax-aux-analyze-table.html) yourself on any tables you want to profile, then set `call_analyze` to `false`. diff --git a/metadata-ingestion/docs/sources/dbt/dbt-cloud_recipe.yml b/metadata-ingestion/docs/sources/dbt/dbt-cloud_recipe.yml index 113303cfc1ad40..ef0776b189ca9d 100644 --- a/metadata-ingestion/docs/sources/dbt/dbt-cloud_recipe.yml +++ b/metadata-ingestion/docs/sources/dbt/dbt-cloud_recipe.yml @@ -6,14 +6,14 @@ source: # In the URL https://cloud.getdbt.com/next/deploy/107298/projects/175705/jobs/148094, # 107298 is the account_id, 175705 is the project_id, and 148094 is the job_id - account_id: # set to your dbt cloud account id - project_id: # set to your dbt cloud project id - job_id: # set to your dbt cloud job id + account_id: "${DBT_ACCOUNT_ID}" # set to your dbt cloud account id + project_id: "${DBT_PROJECT_ID}" # set to your dbt cloud project id + job_id: "${DBT_JOB_ID}" # set to your dbt cloud job id run_id: # set to your dbt cloud run id. This is optional, and defaults to the latest run target_platform: postgres # Options - target_platform: "my_target_platform_id" # e.g. bigquery/postgres/etc. + target_platform: "${TARGET_PLATFORM_ID}" # e.g. bigquery/postgres/etc. # sink configs diff --git a/metadata-ingestion/docs/sources/dbt/dbt.md b/metadata-ingestion/docs/sources/dbt/dbt.md index bfc3ebd5bb350b..43ced13c3b1f8d 100644 --- a/metadata-ingestion/docs/sources/dbt/dbt.md +++ b/metadata-ingestion/docs/sources/dbt/dbt.md @@ -38,6 +38,12 @@ meta_mapping: operation: "add_terms" config: separator: "," + documentation_link: + match: "(?:https?)?\:\/\/\w*[^#]*" + operation: "add_doc_link" + config: + link: {{ $match }} + description: "Documentation Link" column_meta_mapping: terms_list: match: ".*" @@ -57,6 +63,7 @@ We support the following operations: 2. add_term - Requires `term` property in config. 3. add_terms - Accepts an optional `separator` property in config. 4. add_owner - Requires `owner_type` property in config which can be either user or group. Optionally accepts the `owner_category` config property which you can set to one of `['TECHNICAL_OWNER', 'BUSINESS_OWNER', 'DATA_STEWARD', 'DATAOWNER'` (defaults to `DATAOWNER`). +5. add_doc_link - Requires `link` and `description` properties in config. Upon ingestion run, this will overwrite current links in the institutional knowledge section with this new link. The anchor text is defined here in the meta_mappings as `description`. 
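+
+To sanity-check which `meta` values the `documentation_link` mapping above will pick up, you can exercise the same regex directly in Python. A quick sketch; the sample values are made up:
+
+```python
+import re
+
+# The same pattern used in the documentation_link meta mapping above.
+pattern = re.compile(r"(?:https?)?\:\/\/\w*[^#]*")
+
+samples = [
+    "https://internal.wiki/datasets/orders",  # matches
+    "://example.com/doc",  # matches, since the scheme prefix is optional
+    "not a link",  # no match
+]
+for value in samples:
+    print(f"{value!r} -> {bool(pattern.match(value))}")
+```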
 Note:
diff --git a/metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md b/metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md
index 7f9a0324c7bc64..a1c0a6e2d4d214 100644
--- a/metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md
+++ b/metadata-ingestion/docs/sources/dynamodb/dynamodb_post.md
@@ -1,21 +1,18 @@
-## Limitations
-
-For each region, the list table operation returns maximum number 100 tables, we need to further improve it by implementing pagination for listing tables
-
 ## Advanced Configurations
 
 ### Using `include_table_item` config
 
-If there are items that have most representative fields of the table, user could use the `include_table_item` option to provide a list of primary keys of a table in dynamodb format, those items from given primary keys will be included when we scan the table.
+If there are items that contain the most representative fields of the table, users can use the `include_table_item` option to provide a list of primary keys of the table in dynamodb format. We include these items in addition to the first 100 items in the table when we scan it.
 
-Take [AWS DynamoDB Developer Guide Example tables and data](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AppendixSampleTables.html) as an example, if user has a table `Reply` with composite primary key `Id` and `ReplyDateTime`, user can use `include_table_item` to include 2 items as following:
+Take [AWS DynamoDB Developer Guide Example tables and data](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/AppendixSampleTables.html) as an example: if an account has a table `Reply` in the `us-west-2` region with composite primary key `Id` and `ReplyDateTime`, users can use `include_table_item` to include 2 items as follows:
 
 Example:
 
 ```yml
-# put the table name and composite key in DynamoDB format
+# The table name should be in the format of region.table_name
+# The primary keys should be in the DynamoDB format
 include_table_item:
-  Reply:
+  us-west-2.Reply:
     [
       {
         "ReplyDateTime": { "S": "2015-09-22T19:58:22.947Z" },
diff --git a/metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md b/metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md
index a48e8d5be04aa0..598d0ecdb3786b 100644
--- a/metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md
+++ b/metadata-ingestion/docs/sources/dynamodb/dynamodb_pre.md
@@ -1,8 +1,8 @@
 ### Prerequisities
 
-In order to execute this source, you will need to create access key and secret keys that have DynamoDB read access. You can create these policies and attach to your account or can ask your account admin to attach these policies to your account.
+In order to execute this source, you need to attach the `AmazonDynamoDBReadOnlyAccess` policy to a user in your AWS account. Then create an API access key and secret for the user.
 
-For access key permissions, you can create a policy with permissions below and attach to your account, you can find more details in [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html)
+For a user to be able to create an API access key, they need the following access key permissions. Your AWS account admin can create a policy with these permissions and attach it to the user; you can find more details in [Managing access keys for IAM users](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_credentials_access-keys.html)
 
 ```json
 {
@@ -22,5 +22,3 @@ For access key permissions, you can create a policy with permissions below and a
     ]
 }
 ```
-
-For DynamoDB read access, you can simply attach AWS managed policy `AmazonDynamoDBReadOnlyAccess` to your account, you can find more details in [Attaching a policy to an IAM user group](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_groups_manage_attach-policy.html)
diff --git a/metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml b/metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml
index bd41637907b5c9..4f4edc9a7d496e 100644
--- a/metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml
+++ b/metadata-ingestion/docs/sources/dynamodb/dynamodb_recipe.yml
@@ -4,16 +4,14 @@ source:
     platform_instance: "AWS_ACCOUNT_ID"
     aws_access_key_id: "${AWS_ACCESS_KEY_ID}"
     aws_secret_access_key: "${AWS_SECRET_ACCESS_KEY}"
-    # User could use the below option to provide a list of primary keys of a table in dynamodb format,
-    # those items from given primary keys will be included when we scan the table.
-    # For each table we can retrieve up to 16 MB of data, which can contain as many as 100 items.
-    # We'll enforce the the primary keys list size not to exceed 100
-    # The total items we'll try to retrieve in these two scenarios:
-    # 1. If user don't specify include_table_item: we'll retrieve up to 100 items
-    # 2. If user specifies include_table_item: we'll retrieve up to 100 items plus user specified items in
-    #    the table, with a total not more than 200 items
+    #
+    # If there are items that contain the most representative fields of the table, users can use the
+    # `include_table_item` option to provide a list of primary keys of the table in dynamodb format.
+    # For each `region.table`, the list of primary keys can be at most 100.
+    # We include these items in addition to the first 100 items in the table when we scan it.
+    #
     # include_table_item:
-    #   table_name:
+    #   region.table_name:
     #     [
     #       {
     #         "partition_key_name": { "attribute_type": "attribute_value" },
diff --git a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md
index 0323e214045aef..fcfae6cd1e6d78 100644
--- a/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md
+++ b/metadata-ingestion/docs/sources/powerbi/powerbi_pre.md
@@ -40,7 +40,7 @@ PowerBI Source supports M-Query expression for below listed PowerBI Data Sources
 4. Microsoft SQL Server
 5. Google BigQuery
 
-Native SQL query parsing is supported for `Snowflake` and `Amazon Redshift` data-sources and only first table from `FROM` clause will be ingested as upstream table. Advance SQL construct like JOIN and SUB-QUERIES in `FROM` clause are not supported.
+Native SQL query parsing is supported for `Snowflake` and `Amazon Redshift` data-sources.
 
 For example refer below native SQL query. The table `OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_UNIT_TARGET` will be ingested as upstream table.
diff --git a/metadata-ingestion/docs/sources/teradata/teradata_pre.md b/metadata-ingestion/docs/sources/teradata/teradata_pre.md
new file mode 100644
index 00000000000000..7b4da1255d5759
--- /dev/null
+++ b/metadata-ingestion/docs/sources/teradata/teradata_pre.md
@@ -0,0 +1,28 @@
+### Prerequisites
+1. Create a user which has access to the database you want to ingest.
+   ```sql
+   CREATE USER datahub FROM <database> AS PASSWORD = <password> PERM = 20000000;
+   ```
+2. Grant the new user the following privileges:
+   ```sql
+   GRANT SELECT ON dbc.columns TO datahub;
+   GRANT SELECT ON dbc.databases TO datahub;
+   GRANT SELECT ON dbc.tables TO datahub;
+   GRANT SELECT ON DBC.All_RI_ChildrenV TO datahub;
+   GRANT SELECT ON DBC.ColumnsV TO datahub;
+   GRANT SELECT ON DBC.IndicesV TO datahub;
+   GRANT SELECT ON dbc.TableTextV TO datahub;
+   GRANT SELECT ON dbc.TablesV TO datahub;
+   GRANT SELECT ON dbc.dbqlogtbl TO datahub; -- if lineage or usage extraction is enabled
+   ```
+
+   If you want to run profiling, you need to grant SELECT permission on all the tables you want to profile.
+
+3. If lineage or usage extraction is enabled, please check that query logging is enabled and that it is set to a size which
+will fit your queries (the default query text size Teradata captures is max 200 chars).
+   An example of how you can set it for all users:
+   ```sql
+   REPLACE QUERY LOGGING LIMIT SQLTEXT=2000 ON ALL;
+   ```
+   See more about query logging here:
+   [https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Database-Reference/Database-Administration/Tracking-Query-Behavior-with-Database-Query-Logging-Operational-DBAs](https://docs.teradata.com/r/Teradata-VantageCloud-Lake/Database-Reference/Database-Administration/Tracking-Query-Behavior-with-Database-Query-Logging-Operational-DBAs)
diff --git a/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml b/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml
new file mode 100644
index 00000000000000..cc94de20110fe1
--- /dev/null
+++ b/metadata-ingestion/docs/sources/teradata/teradata_recipe.yml
@@ -0,0 +1,16 @@
+pipeline_name: my-teradata-ingestion-pipeline
+source:
+  type: teradata
+  config:
+    host_port: "myteradatainstance.teradata.com:1025"
+    username: myuser
+    password: mypassword
+    #database_pattern:
+    #  allow:
+    #    - "my_database"
+    #  ignoreCase: true
+    include_table_lineage: true
+    include_usage_statistics: true
+    stateful_ingestion:
+      enabled: true
+sink:
diff --git a/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml b/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml
new file mode 100644
index 00000000000000..c73904403f678d
--- /dev/null
+++ b/metadata-ingestion/examples/data_contract/pet_of_the_week.dhub.dc.yaml
@@ -0,0 +1,21 @@
+# id: pet_details_dc # Optional: This is the unique identifier for the data contract
+display_name: Data Contract for SampleHiveDataset
+entity: urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)
+freshness:
+  time: 0700
+  granularity: DAILY
+schema:
+  properties:
+    field_foo:
+      type: string
+      native_type: VARCHAR(100)
+    field_bar:
+      type: boolean
+  required:
+    - field_bar
+data_quality:
+  - type: column_range
+    config:
+      column: field_foo
+      min: 0
+      max: 100
diff --git a/metadata-ingestion/examples/library/create_dataproduct.py b/metadata-ingestion/examples/library/create_dataproduct.py
new file mode 100644
index 00000000000000..245395b6024803
--- /dev/null
+++ b/metadata-ingestion/examples/library/create_dataproduct.py
@@ -0,0 +1,25 @@
+from datahub.api.entities.dataproduct.dataproduct import DataProduct
+from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph
+
+gms_endpoint = "http://localhost:8080"
+graph = DataHubGraph(DatahubClientConfig(server=gms_endpoint))
+
+data_product = DataProduct(
+    id="pet_of_the_week",
+    display_name="Pet of the Week Campaign",
+    domain="urn:li:domain:ef39e99a-9d61-406d-b4a8-c70b16380206",
+
description="This campaign includes Pet of the Week data.", + assets=[ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.pet_details,PROD)", + "urn:li:dashboard:(looker,baz)", + "urn:li:dataFlow:(airflow,dag_abc,PROD)", + ], + owners=[{"id": "urn:li:corpuser:jdoe", "type": "BUSINESS_OWNER"}], + terms=["urn:li:glossaryTerm:ClientsAndAccounts.AccountBalance"], + tags=["urn:li:tag:adoption"], + properties={"lifecycle": "production", "sla": "7am every day"}, + external_url="https://en.wikipedia.org/wiki/Sloth", +) + +for mcp in data_product.generate_mcp(upsert=False): + graph.emit(mcp) diff --git a/metadata-ingestion/examples/library/read_lineage_rest.py b/metadata-ingestion/examples/library/read_lineage_rest.py index 34437ed86280dc..bd9b4e8651dba9 100644 --- a/metadata-ingestion/examples/library/read_lineage_rest.py +++ b/metadata-ingestion/examples/library/read_lineage_rest.py @@ -6,7 +6,7 @@ # Query multiple aspects from entity query = """ -mutation searchAcrossLineage { +query searchAcrossLineage { searchAcrossLineage( input: { query: "*" diff --git a/metadata-ingestion/scripts/avro_codegen.py b/metadata-ingestion/scripts/avro_codegen.py index a9b9b4b20f5ac8..021ebd4a31eb3a 100644 --- a/metadata-ingestion/scripts/avro_codegen.py +++ b/metadata-ingestion/scripts/avro_codegen.py @@ -152,7 +152,8 @@ def add_name(self, name_attr, space_attr, new_schema): return encoded -autogen_header = """# flake8: noqa +autogen_header = """# mypy: ignore-errors +# flake8: noqa # This file is autogenerated by /metadata-ingestion/scripts/avro_codegen.py # Do not modify manually! diff --git a/metadata-ingestion/scripts/modeldocgen.py b/metadata-ingestion/scripts/modeldocgen.py index ffa80515dbafd3..81b26145e620c9 100644 --- a/metadata-ingestion/scripts/modeldocgen.py +++ b/metadata-ingestion/scripts/modeldocgen.py @@ -351,8 +351,8 @@ def strip_types(field_path: str) -> str: field_objects = [] for f in entity_fields: field = avro.schema.Field( - type=f["type"], - name=f["name"], + f["type"], + f["name"], has_default=False, ) field_objects.append(field) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 8fb7b5f29cc229..afce8dcee840b4 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -1,4 +1,3 @@ -import os import sys from typing import Dict, Set @@ -9,16 +8,9 @@ exec(fp.read(), package_metadata) -def get_long_description(): - root = os.path.dirname(__file__) - with open(os.path.join(root, "README.md")) as f: - description = f.read() - - return description - - base_requirements = { - "typing_extensions>=3.10.0.2", + # Typing extension should be >=3.10.0.2 ideally but we can't restrict due to a Airflow 2.1 dependency conflict. + "typing_extensions>=3.7.4.3", "mypy_extensions>=0.4.3", # Actual dependencies. "typing-inspect", @@ -40,13 +32,12 @@ def get_long_description(): "expandvars>=0.6.5", "avro-gen3==0.7.11", # "avro-gen3 @ git+https://github.com/acryldata/avro_gen@master#egg=avro-gen3", - "avro>=1.10.2,<1.11", + "avro>=1.11.3,<1.12", "python-dateutil>=2.8.0", "tabulate", "progressbar2", "termcolor>=1.0.0", "psutil>=5.8.0", - "ratelimiter", "Deprecated", "humanfriendly", "packaging", @@ -110,22 +101,36 @@ def get_long_description(): "grpcio-tools>=1.44.0,<2", } -sql_common = { - # Required for all SQL sources. - # This is temporary lower bound that we're open to loosening/tightening as requirements show up - "sqlalchemy>=1.4.39, <2", - # Required for SQL profiling. 
- "great-expectations>=0.15.12, <=0.15.50", - # scipy version restricted to reduce backtracking, used by great-expectations, - "scipy>=1.7.2", - # GE added handling for higher version of jinja2 - # https://github.com/great-expectations/great_expectations/pull/5382/files - # datahub does not depend on traitlets directly but great expectations does. - # https://github.com/ipython/traitlets/issues/741 - "traitlets<5.2.2", - "greenlet", +usage_common = { + "sqlparse", } +sqlglot_lib = { + # Using an Acryl fork of sqlglot. + # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1 + "acryl-sqlglot==19.0.2.dev10", +} + +sql_common = ( + { + # Required for all SQL sources. + # This is temporary lower bound that we're open to loosening/tightening as requirements show up + "sqlalchemy>=1.4.39, <2", + # Required for SQL profiling. + "great-expectations>=0.15.12, <=0.15.50", + # scipy version restricted to reduce backtracking, used by great-expectations, + "scipy>=1.7.2", + # GE added handling for higher version of jinja2 + # https://github.com/great-expectations/great_expectations/pull/5382/files + # datahub does not depend on traitlets directly but great expectations does. + # https://github.com/ipython/traitlets/issues/741 + "traitlets<5.2.2", + "greenlet", + } + | usage_common + | sqlglot_lib +) + sqllineage_lib = { "sqllineage==1.3.8", # We don't have a direct dependency on sqlparse but it is a dependency of sqllineage. @@ -134,12 +139,6 @@ def get_long_description(): "sqlparse==0.4.4", } -sqlglot_lib = { - # Using an Acryl fork of sqlglot. - # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1 - "acryl-sqlglot==18.5.2.dev45", -} - aws_common = { # AWS Python SDK "boto3", @@ -174,7 +173,9 @@ def get_long_description(): clickhouse_common = { # Clickhouse 0.2.0 adds support for SQLAlchemy 1.4.x - "clickhouse-sqlalchemy>=0.2.0", + # Disallow 0.2.5 because of https://github.com/xzkostyan/clickhouse-sqlalchemy/issues/272. + # Note that there's also a known issue around nested map types: https://github.com/xzkostyan/clickhouse-sqlalchemy/issues/269. + "clickhouse-sqlalchemy>=0.2.0,<0.2.5", } redshift_common = { @@ -252,13 +253,9 @@ def get_long_description(): powerbi_report_server = {"requests", "requests_ntlm"} -usage_common = { - "sqlparse", -} - databricks = { # 0.1.11 appears to have authentication issues with azure databricks - "databricks-sdk>=0.1.1, != 0.1.11", + "databricks-sdk>=0.9.0", "pyspark", "requests", } @@ -270,6 +267,7 @@ def get_long_description(): # Sink plugins. "datahub-kafka": kafka_common, "datahub-rest": rest_common, + "sync-file-emitter": {"filelock"}, "datahub-lite": { "duckdb", "fastapi", @@ -288,8 +286,10 @@ def get_long_description(): # Misc plugins. 
"sql-parser": sqlglot_lib, # Source plugins - # PyAthena is pinned with exact version because we use private method in PyAthena - "athena": sql_common | {"PyAthena[SQLAlchemy]==2.4.1"}, + # sqlalchemy-bigquery is included here since it provides an implementation of + # a SQLalchemy-conform STRUCT type definition + "athena": sql_common + | {"PyAthena[SQLAlchemy]>=2.6.0,<3.0.0", "sqlalchemy-bigquery>=1.4.1"}, "azure-ad": set(), "bigquery": sql_common | bigquery_common @@ -361,9 +361,13 @@ def get_long_description(): | {"psycopg2-binary", "pymysql>=1.0.2"}, "pulsar": {"requests"}, "redash": {"redash-toolbelt", "sql-metadata"} | sqllineage_lib, - "redshift": sql_common | redshift_common | usage_common | {"redshift-connector"}, - "redshift-legacy": sql_common | redshift_common, - "redshift-usage-legacy": sql_common | usage_common | redshift_common, + "redshift": sql_common + | redshift_common + | usage_common + | {"redshift-connector"} + | sqlglot_lib, + "redshift-legacy": sql_common | redshift_common | sqlglot_lib, + "redshift-usage-legacy": sql_common | redshift_common | sqlglot_lib | usage_common, "s3": {*s3_base, *data_lake_profiling}, "gcs": {*s3_base, *data_lake_profiling}, "sagemaker": aws_common, @@ -380,12 +384,16 @@ def get_long_description(): # FIXME: I don't think tableau uses sqllineage anymore so we should be able # to remove that dependency. "tableau": {"tableauserverclient>=0.17.0"} | sqllineage_lib | sqlglot_lib, + "teradata": sql_common + | usage_common + | sqlglot_lib + | {"teradatasqlalchemy>=17.20.0.0"}, "trino": sql_common | trino, "starburst-trino-usage": sql_common | usage_common | trino, "nifi": {"requests", "packaging", "requests-gssapi"}, "powerbi": microsoft_common | {"lark[regex]==1.1.4", "sqlparse"} | sqlglot_lib, "powerbi-report-server": powerbi_report_server, - "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8"}, + "vertica": sql_common | {"vertica-sqlalchemy-dialect[vertica-python]==0.0.8.1"}, "unity-catalog": databricks | sqllineage_lib, } @@ -438,6 +446,10 @@ def get_long_description(): deepdiff_dep = "deepdiff" test_api_requirements = {pytest_dep, deepdiff_dep, "PyYAML"} +debug_requirements = { + "memray", +} + base_dev_requirements = { *base_requirements, *framework_common, @@ -502,6 +514,7 @@ def get_long_description(): "s3", "snowflake", "tableau", + "teradata", "trino", "hive", "starburst-trino-usage", @@ -600,6 +613,7 @@ def get_long_description(): "tableau = datahub.ingestion.source.tableau:TableauSource", "openapi = datahub.ingestion.source.openapi:OpenApiSource", "metabase = datahub.ingestion.source.metabase:MetabaseSource", + "teradata = datahub.ingestion.source.sql.teradata:TeradataSource", "trino = datahub.ingestion.source.sql.trino:TrinoSource", "starburst-trino-usage = datahub.ingestion.source.usage.starburst_trino_usage:TrinoUsageSource", "nifi = datahub.ingestion.source.nifi:NifiSource", @@ -667,10 +681,16 @@ def get_long_description(): "Documentation": "https://datahubproject.io/docs/", "Source": "https://github.com/datahub-project/datahub", "Changelog": "https://github.com/datahub-project/datahub/releases", + "Releases": "https://github.com/acryldata/datahub/releases", }, license="Apache License 2.0", description="A CLI to work with DataHub metadata", - long_description=get_long_description(), + long_description="""\ +The `acryl-datahub` package contains a CLI and SDK for interacting with DataHub, +as well as an integration framework for pulling/pushing metadata from external systems. 
+ +See the [DataHub docs](https://datahubproject.io/docs/metadata-ingestion). +""", long_description_content_type="text/markdown", classifiers=[ "Development Status :: 5 - Production/Stable", @@ -725,5 +745,6 @@ def get_long_description(): "dev": list(dev_requirements), "testing-utils": list(test_api_requirements), # To import `datahub.testing` "integration-tests": list(full_test_dev_requirements), + "debug": list(debug_requirements), }, ) diff --git a/metadata-ingestion/src/datahub/api/entities/corpgroup/corpgroup.py b/metadata-ingestion/src/datahub/api/entities/corpgroup/corpgroup.py index 796786beba21bd..a898e35bb810ec 100644 --- a/metadata-ingestion/src/datahub/api/entities/corpgroup/corpgroup.py +++ b/metadata-ingestion/src/datahub/api/entities/corpgroup/corpgroup.py @@ -2,7 +2,7 @@ import logging from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union +from typing import Callable, Iterable, List, Optional, Union import pydantic from pydantic import BaseModel @@ -11,9 +11,10 @@ from datahub.api.entities.corpuser.corpuser import CorpUser, CorpUserGenerationConfig from datahub.configuration.common import ConfigurationError from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph +from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.schema_classes import ( CorpGroupEditableInfoClass, CorpGroupInfoClass, @@ -25,9 +26,6 @@ _Aspect, ) -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - logger = logging.getLogger(__name__) @@ -194,30 +192,9 @@ def generate_mcp( entityUrn=urn, aspect=StatusClass(removed=False) ) - @staticmethod - def _datahub_graph_from_datahub_rest_emitter( - rest_emitter: DatahubRestEmitter, - ) -> DataHubGraph: - """ - Create a datahub graph instance from a REST Emitter. 
- A stop-gap implementation which is expected to be removed after PATCH support is implemented - for membership updates for users <-> groups - """ - graph = DataHubGraph( - config=DatahubClientConfig( - server=rest_emitter._gms_server, - token=rest_emitter._token, - timeout_sec=rest_emitter._connect_timeout_sec, - retry_status_codes=rest_emitter._retry_status_codes, - extra_headers=rest_emitter._session.headers, - disable_ssl_verification=rest_emitter._session.verify is False, - ) - ) - return graph - def emit( self, - emitter: Union[DatahubRestEmitter, "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ @@ -235,7 +212,7 @@ def emit( # who are passing in a DataHubRestEmitter today # we won't need this in the future once PATCH support is implemented as all emitters # will work - datahub_graph = self._datahub_graph_from_datahub_rest_emitter(emitter) + datahub_graph = emitter.to_graph() for mcp in self.generate_mcp( generation_config=CorpGroupGenerationConfig( override_editable=self.overrideEditable, datahub_graph=datahub_graph diff --git a/metadata-ingestion/src/datahub/api/entities/corpuser/corpuser.py b/metadata-ingestion/src/datahub/api/entities/corpuser/corpuser.py index c67eb02a870a5d..9fe1ebedafca7e 100644 --- a/metadata-ingestion/src/datahub/api/entities/corpuser/corpuser.py +++ b/metadata-ingestion/src/datahub/api/entities/corpuser/corpuser.py @@ -1,14 +1,14 @@ from __future__ import annotations from dataclasses import dataclass -from typing import TYPE_CHECKING, Callable, Iterable, List, Optional, Union +from typing import Callable, Iterable, List, Optional import pydantic import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.metadata.schema_classes import ( CorpUserEditableInfoClass, CorpUserInfoClass, @@ -16,9 +16,6 @@ StatusClass, ) -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - @dataclass class CorpUserGenerationConfig: @@ -144,7 +141,7 @@ def generate_mcp( def emit( self, - emitter: Union[DatahubRestEmitter, "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/__init__.py b/metadata-ingestion/src/datahub/api/entities/datacontract/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py new file mode 100644 index 00000000000000..c45d4ddc924580 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion.py @@ -0,0 +1,7 @@ +from typing import Optional + +from datahub.configuration import ConfigModel + + +class BaseAssertion(ConfigModel): + description: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py new file mode 100644 index 00000000000000..a41b0f7aafd9f2 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/assertion_operator.py @@ -0,0 +1,162 @@ +from typing import Optional, Union + +from typing_extensions import Literal, Protocol + +from 
datahub.configuration import ConfigModel +from datahub.metadata.schema_classes import ( + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, +) + + +class Operator(Protocol): + """Specification for an assertion operator. + + This class exists only for documentation (not used in typing checking). + """ + + operator: str + + def id(self) -> str: + ... + + def generate_parameters(self) -> AssertionStdParametersClass: + ... + + +def _generate_assertion_std_parameter( + value: Union[str, int, float] +) -> AssertionStdParameterClass: + if isinstance(value, str): + return AssertionStdParameterClass( + value=value, type=AssertionStdParameterTypeClass.STRING + ) + elif isinstance(value, (int, float)): + return AssertionStdParameterClass( + value=str(value), type=AssertionStdParameterTypeClass.NUMBER + ) + else: + raise ValueError( + f"Unsupported assertion parameter {value} of type {type(value)}" + ) + + +Param = Union[str, int, float] + + +def _generate_assertion_std_parameters( + value: Optional[Param] = None, + min_value: Optional[Param] = None, + max_value: Optional[Param] = None, +) -> AssertionStdParametersClass: + return AssertionStdParametersClass( + value=_generate_assertion_std_parameter(value) if value else None, + minValue=_generate_assertion_std_parameter(min_value) if min_value else None, + maxValue=_generate_assertion_std_parameter(max_value) if max_value else None, + ) + + +class EqualToOperator(ConfigModel): + type: Literal["equal_to"] + value: Union[str, int, float] + + operator: str = AssertionStdOperatorClass.EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class BetweenOperator(ConfigModel): + type: Literal["between"] + min: Union[int, float] + max: Union[int, float] + + operator: str = AssertionStdOperatorClass.BETWEEN + + def id(self) -> str: + return f"{self.type}-{self.min}-{self.max}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters( + min_value=self.min, max_value=self.max + ) + + +class LessThanOperator(ConfigModel): + type: Literal["less_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOperator(ConfigModel): + type: Literal["greater_than"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class LessThanOrEqualToOperator(ConfigModel): + type: Literal["less_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.LESS_THAN_OR_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class GreaterThanOrEqualToOperator(ConfigModel): + type: Literal["greater_than_or_equal_to"] + value: Union[int, float] + + operator: str = AssertionStdOperatorClass.GREATER_THAN_OR_EQUAL_TO + + def id(self) -> str: + return f"{self.type}-{self.value}" + + def 
generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters(value=self.value) + + +class NotNullOperator(ConfigModel): + type: Literal["not_null"] + + operator: str = AssertionStdOperatorClass.NOT_NULL + + def id(self) -> str: + return f"{self.type}" + + def generate_parameters(self) -> AssertionStdParametersClass: + return _generate_assertion_std_parameters() + + +Operators = Union[ + EqualToOperator, + BetweenOperator, + LessThanOperator, + LessThanOrEqualToOperator, + GreaterThanOperator, + GreaterThanOrEqualToOperator, + NotNullOperator, +] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py new file mode 100644 index 00000000000000..6a3944ba36baf0 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/data_quality_assertion.py @@ -0,0 +1,115 @@ +from typing import List, Optional, Union + +import pydantic +from typing_extensions import Literal + +import datahub.emitter.mce_builder as builder +from datahub.api.entities.datacontract.assertion import BaseAssertion +from datahub.api.entities.datacontract.assertion_operator import Operators +from datahub.configuration.common import ConfigModel +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionStdAggregationClass, + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, + AssertionTypeClass, + AssertionValueChangeTypeClass, + DatasetAssertionInfoClass, + DatasetAssertionScopeClass, + SqlAssertionInfoClass, + SqlAssertionTypeClass, +) + + +class IdConfigMixin(BaseAssertion): + id_raw: Optional[str] = pydantic.Field( + default=None, + alias="id", + description="The id of the assertion. If not provided, one will be generated using the type.", + ) + + def generate_default_id(self) -> str: + raise NotImplementedError + + +class CustomSQLAssertion(IdConfigMixin, BaseAssertion): + type: Literal["custom_sql"] + sql: str + operator: Operators = pydantic.Field(discriminator="type") + + def generate_default_id(self) -> str: + return f"{self.type}-{self.sql}-{self.operator.id()}" + + def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass: + sql_assertion_info = SqlAssertionInfoClass( + entity=entity_urn, + statement=self.sql, + operator=self.operator.operator, + parameters=self.operator.generate_parameters(), + # TODO: Support other types of assertions + type=SqlAssertionTypeClass.METRIC, + changeType=AssertionValueChangeTypeClass.ABSOLUTE, + ) + return AssertionInfoClass( + type=AssertionTypeClass.SQL, + sqlAssertion=sql_assertion_info, + description=self.description, + ) + + +class ColumnUniqueAssertion(IdConfigMixin, BaseAssertion): + type: Literal["unique"] + + # TODO: support multiple columns? 
+ column: str + + def generate_default_id(self) -> str: + return f"{self.type}-{self.column}" + + def generate_assertion_info(self, entity_urn: str) -> AssertionInfoClass: + dataset_assertion_info = DatasetAssertionInfoClass( + dataset=entity_urn, + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + fields=[builder.make_schema_field_urn(entity_urn, self.column)], + operator=AssertionStdOperatorClass.EQUAL_TO, + aggregation=AssertionStdAggregationClass.UNIQUE_PROPOTION, # purposely using the misspelled version to work with gql + parameters=AssertionStdParametersClass( + value=AssertionStdParameterClass( + value="1", type=AssertionStdParameterTypeClass.NUMBER + ) + ), + ) + return AssertionInfoClass( + type=AssertionTypeClass.DATASET, + datasetAssertion=dataset_assertion_info, + description=self.description, + ) + + +class DataQualityAssertion(ConfigModel): + __root__: Union[ + CustomSQLAssertion, + ColumnUniqueAssertion, + ] = pydantic.Field(discriminator="type") + + @property + def id(self) -> str: + if self.__root__.id_raw: + return self.__root__.id_raw + try: + return self.__root__.generate_default_id() + except NotImplementedError: + return self.__root__.type + + def generate_mcp( + self, assertion_urn: str, entity_urn: str + ) -> List[MetadataChangeProposalWrapper]: + return [ + MetadataChangeProposalWrapper( + entityUrn=assertion_urn, + aspect=self.__root__.generate_assertion_info(entity_urn), + ) + ] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py new file mode 100644 index 00000000000000..f3c6be55e5fea9 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/datacontract.py @@ -0,0 +1,213 @@ +import collections +from typing import Iterable, List, Optional, Tuple + +import pydantic +from ruamel.yaml import YAML +from typing_extensions import Literal + +import datahub.emitter.mce_builder as builder +from datahub.api.entities.datacontract.data_quality_assertion import ( + DataQualityAssertion, +) +from datahub.api.entities.datacontract.freshness_assertion import FreshnessAssertion +from datahub.api.entities.datacontract.schema_assertion import SchemaAssertion +from datahub.configuration.common import ConfigModel +from datahub.emitter.mce_builder import datahub_guid, make_assertion_urn +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + DataContractPropertiesClass, + DataContractStateClass, + DataContractStatusClass, + DataQualityContractClass, + FreshnessContractClass, + SchemaContractClass, + StatusClass, +) +from datahub.utilities.urns.urn import guess_entity_type + + +class DataContract(ConfigModel): + """A yml representation of a Data Contract. + + This model is used as a simpler, Python-native representation of a DataHub data contract. + It can be easily parsed from a YAML file, and can be easily converted into series of MCPs + that can be emitted to DataHub. + """ + + version: Literal[1] + + id: Optional[str] = pydantic.Field( + default=None, + alias="urn", + description="The data contract urn. 
If not provided, one will be generated.", + ) + entity: str = pydantic.Field( + description="The entity urn that the Data Contract is associated with" + ) + # TODO: add support for properties + # properties: Optional[Dict[str, str]] = None + + schema_field: Optional[SchemaAssertion] = pydantic.Field( + default=None, alias="schema" + ) + + freshness: Optional[FreshnessAssertion] = pydantic.Field(default=None) + + # TODO: Add a validator to ensure that ids are unique + data_quality: Optional[List[DataQualityAssertion]] = pydantic.Field(default=None) + + _original_yaml_dict: Optional[dict] = None + + @pydantic.validator("data_quality") + def validate_data_quality( + cls, data_quality: Optional[List[DataQualityAssertion]] + ) -> Optional[List[DataQualityAssertion]]: + if data_quality: + # Raise an error if there are duplicate ids. + id_counts = collections.Counter(dq_check.id for dq_check in data_quality) + duplicates = [id for id, count in id_counts.items() if count > 1] + + if duplicates: + raise ValueError( + f"Got multiple data quality tests with the same type or ID: {duplicates}. Set a unique ID for each data quality test." + ) + + return data_quality + + @property + def urn(self) -> str: + if self.id: + assert guess_entity_type(self.id) == "dataContract" + return self.id + + # Data contract urns are stable + guid_obj = {"entity": self.entity} + urn = f"urn:li:dataContract:{datahub_guid(guid_obj)}" + return urn + + def _generate_freshness_assertion( + self, freshness: FreshnessAssertion + ) -> Tuple[str, List[MetadataChangeProposalWrapper]]: + guid_dict = { + "contract": self.urn, + "entity": self.entity, + "freshness": freshness.id, + } + assertion_urn = builder.make_assertion_urn(builder.datahub_guid(guid_dict)) + + return ( + assertion_urn, + freshness.generate_mcp(assertion_urn, self.entity), + ) + + def _generate_schema_assertion( + self, schema_metadata: SchemaAssertion + ) -> Tuple[str, List[MetadataChangeProposalWrapper]]: + # ingredients for guid -> the contract id, the fact that this is a schema assertion and the entity on which the assertion is made + guid_dict = { + "contract": self.urn, + "entity": self.entity, + "schema": schema_metadata.id, + } + assertion_urn = make_assertion_urn(datahub_guid(guid_dict)) + + return ( + assertion_urn, + schema_metadata.generate_mcp(assertion_urn, self.entity), + ) + + def _generate_data_quality_assertion( + self, data_quality: DataQualityAssertion + ) -> Tuple[str, List[MetadataChangeProposalWrapper]]: + guid_dict = { + "contract": self.urn, + "entity": self.entity, + "data_quality": data_quality.id, + } + assertion_urn = make_assertion_urn(datahub_guid(guid_dict)) + + return ( + assertion_urn, + data_quality.generate_mcp(assertion_urn, self.entity), + ) + + def _generate_dq_assertions( + self, data_quality_spec: List[DataQualityAssertion] + ) -> Tuple[List[str], List[MetadataChangeProposalWrapper]]: + assertion_urns = [] + assertion_mcps = [] + + for dq_check in data_quality_spec: + assertion_urn, assertion_mcp = self._generate_data_quality_assertion( + dq_check + ) + + assertion_urns.append(assertion_urn) + assertion_mcps.extend(assertion_mcp) + + return (assertion_urns, assertion_mcps) + + def generate_mcp( + self, + ) -> Iterable[MetadataChangeProposalWrapper]: + schema_assertion_urn = None + if self.schema_field is not None: + ( + schema_assertion_urn, + schema_assertion_mcps, + ) = self._generate_schema_assertion(self.schema_field) + yield from schema_assertion_mcps + + freshness_assertion_urn = None + if self.freshness: + ( + 
freshness_assertion_urn, + sla_assertion_mcps, + ) = self._generate_freshness_assertion(self.freshness) + yield from sla_assertion_mcps + + dq_assertions, dq_assertion_mcps = self._generate_dq_assertions( + self.data_quality or [] + ) + yield from dq_assertion_mcps + + # Now that we've generated the assertions, we can generate + # the actual data contract. + yield from MetadataChangeProposalWrapper.construct_many( + entityUrn=self.urn, + aspects=[ + DataContractPropertiesClass( + entity=self.entity, + schema=[SchemaContractClass(assertion=schema_assertion_urn)] + if schema_assertion_urn + else None, + freshness=[ + FreshnessContractClass(assertion=freshness_assertion_urn) + ] + if freshness_assertion_urn + else None, + dataQuality=[ + DataQualityContractClass(assertion=dq_assertion_urn) + for dq_assertion_urn in dq_assertions + ], + ), + # Also emit status. + StatusClass(removed=False), + # Emit the contract state as PENDING. + DataContractStatusClass(state=DataContractStateClass.PENDING) + if True + else None, + ], + ) + + @classmethod + def from_yaml( + cls, + file: str, + ) -> "DataContract": + with open(file) as fp: + yaml = YAML(typ="rt") # default, if not specfied, is 'rt' (round-trip) + orig_dictionary = yaml.load(fp) + parsed_data_contract = DataContract.parse_obj(orig_dictionary) + parsed_data_contract._original_yaml_dict = orig_dictionary + return parsed_data_contract diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py new file mode 100644 index 00000000000000..71741d76b22fc4 --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/freshness_assertion.py @@ -0,0 +1,82 @@ +from __future__ import annotations + +from datetime import timedelta +from typing import List, Union + +import pydantic +from typing_extensions import Literal + +from datahub.api.entities.datacontract.assertion import BaseAssertion +from datahub.configuration.common import ConfigModel +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionTypeClass, + CalendarIntervalClass, + FixedIntervalScheduleClass, + FreshnessAssertionInfoClass, + FreshnessAssertionScheduleClass, + FreshnessAssertionScheduleTypeClass, + FreshnessAssertionTypeClass, + FreshnessCronScheduleClass, +) + + +class CronFreshnessAssertion(BaseAssertion): + type: Literal["cron"] + + cron: str = pydantic.Field( + description="The cron expression to use. See https://crontab.guru/ for help." + ) + timezone: str = pydantic.Field( + "UTC", + description="The timezone to use for the cron schedule. 
Defaults to UTC.", + ) + + def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleClass: + return FreshnessAssertionScheduleClass( + type=FreshnessAssertionScheduleTypeClass.CRON, + cron=FreshnessCronScheduleClass( + cron=self.cron, + timezone=self.timezone, + ), + ) + + +class FixedIntervalFreshnessAssertion(BaseAssertion): + type: Literal["interval"] + + interval: timedelta + + def generate_freshness_assertion_schedule(self) -> FreshnessAssertionScheduleClass: + return FreshnessAssertionScheduleClass( + type=FreshnessAssertionScheduleTypeClass.FIXED_INTERVAL, + fixedInterval=FixedIntervalScheduleClass( + unit=CalendarIntervalClass.SECOND, + multiple=int(self.interval.total_seconds()), + ), + ) + + +class FreshnessAssertion(ConfigModel): + __root__: Union[ + CronFreshnessAssertion, FixedIntervalFreshnessAssertion + ] = pydantic.Field(discriminator="type") + + @property + def id(self): + return self.__root__.type + + def generate_mcp( + self, assertion_urn: str, entity_urn: str + ) -> List[MetadataChangeProposalWrapper]: + aspect = AssertionInfoClass( + type=AssertionTypeClass.FRESHNESS, + freshnessAssertion=FreshnessAssertionInfoClass( + entity=entity_urn, + type=FreshnessAssertionTypeClass.DATASET_CHANGE, + schedule=self.__root__.generate_freshness_assertion_schedule(), + ), + description=self.__root__.description, + ) + return [MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=aspect)] diff --git a/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py new file mode 100644 index 00000000000000..b62f94e0592fce --- /dev/null +++ b/metadata-ingestion/src/datahub/api/entities/datacontract/schema_assertion.py @@ -0,0 +1,80 @@ +from __future__ import annotations + +import json +from typing import List, Union + +import pydantic +from typing_extensions import Literal + +from datahub.api.entities.datacontract.assertion import BaseAssertion +from datahub.configuration.common import ConfigModel +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.extractor.json_schema_util import get_schema_metadata +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionTypeClass, + SchemaAssertionInfoClass, + SchemaFieldClass, + SchemalessClass, + SchemaMetadataClass, +) + + +class JsonSchemaContract(BaseAssertion): + type: Literal["json-schema"] + + json_schema: dict = pydantic.Field(alias="json-schema") + + _schema_metadata: SchemaMetadataClass + + def _init_private_attributes(self) -> None: + super()._init_private_attributes() + self._schema_metadata = get_schema_metadata( + platform="urn:li:dataPlatform:datahub", + name="", + json_schema=self.json_schema, + raw_schema_string=json.dumps(self.json_schema), + ) + + +class FieldListSchemaContract(BaseAssertion, arbitrary_types_allowed=True): + type: Literal["field-list"] + + fields: List[SchemaFieldClass] + + _schema_metadata: SchemaMetadataClass + + def _init_private_attributes(self) -> None: + super()._init_private_attributes() + self._schema_metadata = SchemaMetadataClass( + schemaName="", + platform="urn:li:dataPlatform:datahub", + version=0, + hash="", + platformSchema=SchemalessClass(), + fields=self.fields, + ) + + +class SchemaAssertion(ConfigModel): + __root__: Union[JsonSchemaContract, FieldListSchemaContract] = pydantic.Field( + discriminator="type" + ) + + @property + def id(self): + return self.__root__.type + + def generate_mcp( + self, assertion_urn: str, 
entity_urn: str + ) -> List[MetadataChangeProposalWrapper]: + aspect = AssertionInfoClass( + type=AssertionTypeClass.DATA_SCHEMA, + schemaAssertion=SchemaAssertionInfoClass( + entity=entity_urn, + schema=self.__root__._schema_metadata, + ), + description=self.__root__.description, + ) + + return [MetadataChangeProposalWrapper(entityUrn=assertion_urn, aspect=aspect)] diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py index 8a04768bc0a721..acd708ee81a5c3 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/dataflow.py @@ -1,18 +1,9 @@ import logging from dataclasses import dataclass, field -from typing import ( - TYPE_CHECKING, - Callable, - Dict, - Iterable, - List, - Optional, - Set, - Union, - cast, -) +from typing import Callable, Dict, Iterable, List, Optional, Set, cast import datahub.emitter.mce_builder as builder +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( AuditStampClass, @@ -29,10 +20,6 @@ ) from datahub.utilities.urns.data_flow_urn import DataFlowUrn -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - logger = logging.getLogger(__name__) @@ -170,7 +157,7 @@ def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: def emit( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ diff --git a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py index 7eb6fc8c8d1a92..0face6415bacc4 100644 --- a/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py +++ b/metadata-ingestion/src/datahub/api/entities/datajob/datajob.py @@ -1,16 +1,16 @@ from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional, Set, Union +from typing import Callable, Dict, Iterable, List, Optional, Set import datahub.emitter.mce_builder as builder +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.schema_classes import ( AuditStampClass, AzkabanJobTypeClass, DataJobInfoClass, DataJobInputOutputClass, - DataJobSnapshotClass, + FineGrainedLineageClass, GlobalTagsClass, - MetadataChangeEventClass, OwnerClass, OwnershipClass, OwnershipSourceClass, @@ -23,10 +23,6 @@ from datahub.utilities.urns.data_job_urn import DataJobUrn from datahub.utilities.urns.dataset_urn import DatasetUrn -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - @dataclass class DataJob: @@ -59,6 +55,7 @@ class DataJob: group_owners: Set[str] = field(default_factory=set) inlets: List[DatasetUrn] = field(default_factory=list) outlets: List[DatasetUrn] = field(default_factory=list) + fine_grained_lineages: List[FineGrainedLineageClass] = field(default_factory=list) upstream_urns: List[DataJobUrn] = field(default_factory=list) def __post_init__(self): @@ -103,31 +100,6 @@ def generate_tags_aspect(self) -> Iterable[GlobalTagsClass]: ) return [tags] - def generate_mce(self) -> MetadataChangeEventClass: - job_mce = MetadataChangeEventClass( - 
proposedSnapshot=DataJobSnapshotClass( - urn=str(self.urn), - aspects=[ - DataJobInfoClass( - name=self.name if self.name is not None else self.id, - type=AzkabanJobTypeClass.COMMAND, - description=self.description, - customProperties=self.properties, - externalUrl=self.url, - ), - DataJobInputOutputClass( - inputDatasets=[str(urn) for urn in self.inlets], - outputDatasets=[str(urn) for urn in self.outlets], - inputDatajobs=[str(urn) for urn in self.upstream_urns], - ), - *self.generate_ownership_aspect(), - *self.generate_tags_aspect(), - ], - ) - ) - - return job_mce - def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: mcp = MetadataChangeProposalWrapper( entityUrn=str(self.urn), @@ -159,7 +131,7 @@ def generate_mcp(self) -> Iterable[MetadataChangeProposalWrapper]: def emit( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ @@ -179,6 +151,7 @@ def generate_data_input_output_mcp(self) -> Iterable[MetadataChangeProposalWrapp inputDatasets=[str(urn) for urn in self.inlets], outputDatasets=[str(urn) for urn in self.outlets], inputDatajobs=[str(urn) for urn in self.upstream_urns], + fineGrainedLineages=self.fine_grained_lineages, ), ) yield mcp diff --git a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py index 9ec389c3a09890..cf6080c7072e69 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py +++ b/metadata-ingestion/src/datahub/api/entities/dataprocess/dataprocess_instance.py @@ -1,9 +1,10 @@ import time from dataclasses import dataclass, field from enum import Enum -from typing import TYPE_CHECKING, Callable, Dict, Iterable, List, Optional, Union, cast +from typing import Callable, Dict, Iterable, List, Optional, Union, cast from datahub.api.entities.datajob import DataFlow, DataJob +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import DatahubKey from datahub.metadata.com.linkedin.pegasus2avro.dataprocess import ( @@ -26,10 +27,6 @@ from datahub.utilities.urns.data_process_instance_urn import DataProcessInstanceUrn from datahub.utilities.urns.dataset_urn import DatasetUrn -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - from datahub.emitter.rest_emitter import DatahubRestEmitter - class DataProcessInstanceKey(DatahubKey): cluster: str @@ -106,7 +103,7 @@ def start_event_mcp( def emit_process_start( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, start_timestamp_millis: int, attempt: Optional[int] = None, emit_template: bool = True, @@ -197,7 +194,7 @@ def end_event_mcp( def emit_process_end( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, end_timestamp_millis: int, result: InstanceRunResult, result_type: Optional[str] = None, @@ -207,7 +204,7 @@ def emit_process_end( """ Generate a DataProcessInstance finish event and emit it - :param emitter: (Union[DatahubRestEmitter, DatahubKafkaEmitter]) the datahub emitter to emit generated mcps + :param emitter: (Emitter) the datahub emitter to emit generated mcps :param end_timestamp_millis: (int) the end time of the execution in milliseconds :param result: (InstanceRunResult) The result of the run :param result_type: (string) It identifies the system 
where the native result comes from like Airflow, Azkaban @@ -261,24 +258,24 @@ def generate_mcp( @staticmethod def _emit_mcp( mcp: MetadataChangeProposalWrapper, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ - :param emitter: (Union[DatahubRestEmitter, DatahubKafkaEmitter]) the datahub emitter to emit generated mcps + :param emitter: (Emitter) the datahub emitter to emit generated mcps :param callback: (Optional[Callable[[Exception, str], None]]) the callback method for KafkaEmitter if it is used """ emitter.emit(mcp, callback) def emit( self, - emitter: Union["DatahubRestEmitter", "DatahubKafkaEmitter"], + emitter: Emitter, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: """ - :param emitter: (Union[DatahubRestEmitter, DatahubKafkaEmitter]) the datahub emitter to emit generated mcps + :param emitter: (Emitter) the datahub emitter to emit generated mcps :param callback: (Optional[Callable[[Exception, str], None]]) the callback method for KafkaEmitter if it is used """ for mcp in self.generate_mcp(): diff --git a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py index 04f12b4f61d1e1..28e4a03b8f75f7 100644 --- a/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py +++ b/metadata-ingestion/src/datahub/api/entities/dataproduct/dataproduct.py @@ -2,25 +2,15 @@ import time from pathlib import Path -from typing import ( - TYPE_CHECKING, - Any, - Callable, - Dict, - Iterable, - List, - Optional, - Tuple, - Union, -) +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import pydantic from ruamel.yaml import YAML import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.ingestion.graph.client import DataHubGraph from datahub.metadata.schema_classes import ( AuditStampClass, @@ -43,9 +33,6 @@ from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.urns.urn import Urn -if TYPE_CHECKING: - from datahub.emitter.kafka_emitter import DatahubKafkaEmitter - def patch_list( orig_list: Optional[list], @@ -117,7 +104,7 @@ class DataProduct(ConfigModel): id: str domain: str - _resolved_domain_urn: Optional[str] + _resolved_domain_urn: Optional[str] = None assets: Optional[List[str]] = None display_name: Optional[str] = None owners: Optional[List[Union[str, Ownership]]] = None @@ -225,7 +212,6 @@ def _generate_properties_mcp( def generate_mcp( self, upsert: bool ) -> Iterable[Union[MetadataChangeProposalWrapper, MetadataChangeProposalClass]]: - if self._resolved_domain_urn is None: raise Exception( f"Unable to generate MCP-s because we were unable to resolve the domain {self.domain} to an urn." 
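Here and in the surrounding entity classes (DataFlow, DataJob, DataProcessInstance, and DataProduct just below), the concrete Union["DatahubRestEmitter", "DatahubKafkaEmitter"] annotations are replaced by the new Emitter protocol, so any object implementing emit() can be passed to the emit methods. A minimal sketch of what this enables, assuming a locally running GMS at http://localhost:8080 and a hypothetical dataset urn:

from datahub.emitter.generic_emitter import Emitter
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DataHubRestEmitter
from datahub.metadata.schema_classes import StatusClass

def emit_status(emitter: Emitter, entity_urn: str) -> None:
    # Any Emitter implementation works here: DataHubRestEmitter,
    # DatahubKafkaEmitter, or the SynchronizedFileEmitter added later in this diff.
    emitter.emit(
        MetadataChangeProposalWrapper(
            entityUrn=entity_urn, aspect=StatusClass(removed=False)
        )
    )

emit_status(
    DataHubRestEmitter("http://localhost:8080"),  # assumed local GMS endpoint
    "urn:li:dataset:(urn:li:dataPlatform:hive,db.table,PROD)",  # hypothetical urn
)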
@@ -282,7 +268,7 @@ def generate_mcp( def emit( self, - emitter: Union[DatahubRestEmitter, "DatahubKafkaEmitter"], + emitter: Emitter, upsert: bool, callback: Optional[Callable[[Exception, str], None]] = None, ) -> None: @@ -440,7 +426,6 @@ def patch_yaml( original_dataproduct: DataProduct, output_file: Path, ) -> bool: - update_needed = False if not original_dataproduct._original_yaml_dict: raise Exception("Original Data Product was not loaded from yaml") @@ -523,7 +508,6 @@ def to_yaml( self, file: Path, ) -> None: - with open(file, "w") as fp: yaml = YAML(typ="rt") # default, if not specified, is 'rt' (round-trip) yaml.indent(mapping=2, sequence=4, offset=2) diff --git a/metadata-ingestion/src/datahub/cli/docker_cli.py b/metadata-ingestion/src/datahub/cli/docker_cli.py index 4afccfe711e347..77e3285d359efc 100644 --- a/metadata-ingestion/src/datahub/cli/docker_cli.py +++ b/metadata-ingestion/src/datahub/cli/docker_cli.py @@ -5,6 +5,7 @@ import os import pathlib import platform +import signal import subprocess import sys import tempfile @@ -770,6 +771,10 @@ def quickstart( # noqa: C901 logger.debug("docker compose up still running, sending SIGKILL") up_process.kill() up_process.wait() + else: + # If the docker process got a keyboard interrupt, raise one here. + if up_process.returncode in {128 + signal.SIGINT, -signal.SIGINT}: + raise KeyboardInterrupt # Check docker health every few seconds. status = check_docker_quickstart() diff --git a/metadata-ingestion/src/datahub/cli/specific/datacontract_cli.py b/metadata-ingestion/src/datahub/cli/specific/datacontract_cli.py new file mode 100644 index 00000000000000..3745943c8c96ad --- /dev/null +++ b/metadata-ingestion/src/datahub/cli/specific/datacontract_cli.py @@ -0,0 +1,80 @@ +import logging +from typing import Optional + +import click +from click_default_group import DefaultGroup + +from datahub.api.entities.datacontract.datacontract import DataContract +from datahub.ingestion.graph.client import get_default_graph +from datahub.telemetry import telemetry +from datahub.upgrade import upgrade + +logger = logging.getLogger(__name__) + + +@click.group(cls=DefaultGroup, default="upsert") +def datacontract() -> None: + """A group of commands to interact with the DataContract entity in DataHub.""" + pass + + +@datacontract.command() +@click.option("-f", "--file", required=True, type=click.Path(exists=True)) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def upsert(file: str) -> None: + """Upsert (create or update) a Data Contract in DataHub.""" + + data_contract: DataContract = DataContract.from_yaml(file) + urn = data_contract.urn + + with get_default_graph() as graph: + if not graph.exists(data_contract.entity): + raise ValueError( + f"Cannot define a data contract for non-existent entity {data_contract.entity}" + ) + + try: + for mcp in data_contract.generate_mcp(): + graph.emit(mcp) + click.secho(f"Update succeeded for urn {urn}.", fg="green") + except Exception as e: + logger.exception(e) + click.secho( + f"Update failed for {urn}: {e}", + fg="red", + ) + + +@datacontract.command() +@click.option( + "--urn", required=False, type=str, help="The urn for the data contract to delete" +) +@click.option( + "-f", + "--file", + required=False, + type=click.Path(exists=True), + help="The file containing the data contract definition", +) +@click.option("--hard/--soft", required=False, is_flag=True, default=False) +@upgrade.check_upgrade +@telemetry.with_telemetry() +def delete(urn: Optional[str], file: Optional[str], hard: bool) -> None: 
"""Delete a Data Contract in DataHub. Defaults to a soft-delete. Use --hard to completely erase metadata.""" + + if not urn: + if not file: + raise click.UsageError( + "Must provide either an urn or a file to delete a data contract" + ) + + data_contract = DataContract.from_yaml(file) + urn = data_contract.urn + + with get_default_graph() as graph: + if not graph.exists(urn): + raise ValueError(f"Data Contract {urn} does not exist") + + graph.delete_entity(urn, hard=hard) + click.secho(f"Data Contract {urn} deleted") diff --git a/metadata-ingestion/src/datahub/cli/specific/file_loader.py b/metadata-ingestion/src/datahub/cli/specific/file_loader.py index 54f12e024d2948..a9787343fdb911 100644 --- a/metadata-ingestion/src/datahub/cli/specific/file_loader.py +++ b/metadata-ingestion/src/datahub/cli/specific/file_loader.py @@ -1,9 +1,7 @@ -import io from pathlib import Path from typing import Union -from datahub.configuration.common import ConfigurationError -from datahub.configuration.yaml import YamlConfigurationMechanism +from datahub.configuration.config_loader import load_config_file def load_file(config_file: Path) -> Union[dict, list]: @@ -17,19 +15,11 @@ def load_file(config_file: Path) -> Union[dict, list]: evolve to becoming a standard function that all the specific. cli variants will use to load up the models from external files """ - if not isinstance(config_file, Path): - config_file = Path(config_file) - if not config_file.is_file(): - raise ConfigurationError(f"Cannot open config file {config_file}") - if config_file.suffix in {".yaml", ".yml"}: - config_mech: YamlConfigurationMechanism = YamlConfigurationMechanism() - else: - raise ConfigurationError( - f"Only .yaml and .yml are supported. Cannot process file type {config_file.suffix}" - ) - - raw_config_file = config_file.read_text() - config_fp = io.StringIO(raw_config_file) - raw_config = config_mech.load_config(config_fp) - return raw_config + res = load_config_file( + config_file, + squirrel_original_config=False, + resolve_env_vars=False, + allow_stdin=False, + ) + return res diff --git a/metadata-ingestion/src/datahub/cli/specific/group_cli.py b/metadata-ingestion/src/datahub/cli/specific/group_cli.py index 9baa8ee68d9758..e313fce33d4d57 100644 --- a/metadata-ingestion/src/datahub/cli/specific/group_cli.py +++ b/metadata-ingestion/src/datahub/cli/specific/group_cli.py @@ -43,7 +43,7 @@ def upsert(file: Path, override_editable: bool) -> None: with get_default_graph() as emitter: for group_config in group_configs: try: - datahub_group = CorpGroup.parse_obj(config_dict) + datahub_group = CorpGroup.parse_obj(group_config) for mcp in datahub_group.generate_mcp( generation_config=CorpGroupGenerationConfig( override_editable=override_editable, datahub_graph=emitter diff --git a/metadata-ingestion/src/datahub/configuration/common.py b/metadata-ingestion/src/datahub/configuration/common.py index c909b89eb0c2dd..73ac4baac48c0f 100644 --- a/metadata-ingestion/src/datahub/configuration/common.py +++ b/metadata-ingestion/src/datahub/configuration/common.py @@ -283,7 +283,7 @@ class VersionedConfig(ConfigModel): class LineageConfig(ConfigModel): incremental_lineage: bool = Field( - default=True, + default=False, description="When enabled, emits lineage as incremental to existing lineage already in DataHub. 
When disabled, re-states lineage on each run.", ) diff --git a/metadata-ingestion/src/datahub/configuration/source_common.py b/metadata-ingestion/src/datahub/configuration/source_common.py index 37b93f3e598e1a..80b6ceb576c1cc 100644 --- a/metadata-ingestion/src/datahub/configuration/source_common.py +++ b/metadata-ingestion/src/datahub/configuration/source_common.py @@ -4,7 +4,7 @@ from pydantic.fields import Field from datahub.configuration.common import ConfigModel, ConfigurationError -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.metadata.schema_classes import FabricTypeClass DEFAULT_ENV = FabricTypeClass.PROD @@ -54,6 +54,13 @@ class DatasetSourceConfigMixin(PlatformInstanceConfigMixin, EnvConfigMixin): """ +class LowerCaseDatasetUrnConfigMixin(ConfigModel): + convert_urns_to_lowercase: bool = Field( + default=False, + description="Whether to convert dataset urns to lowercase.", + ) + + class DatasetLineageProviderConfigBase(EnvConfigMixin): """ Any non-Dataset source that produces lineage to Datasets should inherit this class. diff --git a/metadata-ingestion/src/datahub/configuration/pydantic_field_deprecation.py b/metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py similarity index 74% rename from metadata-ingestion/src/datahub/configuration/pydantic_field_deprecation.py rename to metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py index ed82acb594ed7c..6134c4dab48174 100644 --- a/metadata-ingestion/src/datahub/configuration/pydantic_field_deprecation.py +++ b/metadata-ingestion/src/datahub/configuration/validate_field_deprecation.py @@ -1,20 +1,28 @@ import warnings -from typing import Optional, Type +from typing import Any, Optional, Type import pydantic from datahub.configuration.common import ConfigurationWarning from datahub.utilities.global_warning_util import add_global_warning +_unset = object() -def pydantic_field_deprecated(field: str, message: Optional[str] = None) -> classmethod: + +def pydantic_field_deprecated( + field: str, + warn_if_value_is_not: Any = _unset, + message: Optional[str] = None, +) -> classmethod: if message: output = message else: output = f"{field} is deprecated and will be removed in a future release. Please remove it from your config." def _validate_deprecated(cls: Type, values: dict) -> dict: - if field in values: + if field in values and ( + warn_if_value_is_not is _unset or values[field] != warn_if_value_is_not + ): add_global_warning(output) warnings.warn(output, ConfigurationWarning, stacklevel=2) return values diff --git a/metadata-ingestion/src/datahub/emitter/generic_emitter.py b/metadata-ingestion/src/datahub/emitter/generic_emitter.py new file mode 100644 index 00000000000000..28138c61827583 --- /dev/null +++ b/metadata-ingestion/src/datahub/emitter/generic_emitter.py @@ -0,0 +1,31 @@ +from typing import Any, Callable, Optional, Union + +from typing_extensions import Protocol + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( + MetadataChangeEvent, + MetadataChangeProposal, +) + + +class Emitter(Protocol): + def emit( + self, + item: Union[ + MetadataChangeEvent, + MetadataChangeProposal, + MetadataChangeProposalWrapper, + ], + # NOTE: This signature should have the exception be optional rather than + # required. 
However, this would be a breaking change that may need + # more careful consideration. + callback: Optional[Callable[[Exception, str], None]] = None, + # TODO: The rest emitter returns timestamps as the return type. For now + # we smooth over that detail using Any, but eventually we should + # standardize on a return type. + ) -> Any: + raise NotImplementedError + + def flush(self) -> None: + pass diff --git a/metadata-ingestion/src/datahub/emitter/kafka_emitter.py b/metadata-ingestion/src/datahub/emitter/kafka_emitter.py index ec0c8f3418a4aa..781930011b78fb 100644 --- a/metadata-ingestion/src/datahub/emitter/kafka_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/kafka_emitter.py @@ -10,6 +10,7 @@ from datahub.configuration.common import ConfigModel from datahub.configuration.kafka import KafkaProducerConnectionConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.closeable import Closeable from datahub.metadata.schema_classes import ( @@ -55,7 +56,7 @@ def validate_topic_routes(cls, v: Dict[str, str]) -> Dict[str, str]: return v -class DatahubKafkaEmitter(Closeable): +class DatahubKafkaEmitter(Closeable, Emitter): def __init__(self, config: KafkaEmitterConfig): self.config = config schema_registry_conf = { diff --git a/metadata-ingestion/src/datahub/emitter/mce_builder.py b/metadata-ingestion/src/datahub/emitter/mce_builder.py index 0928818c7005c7..64c9ec1bb5704d 100644 --- a/metadata-ingestion/src/datahub/emitter/mce_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mce_builder.py @@ -1,11 +1,11 @@ """Convenience functions for creating MCEs""" +import hashlib import json import logging import os import re import time from enum import Enum -from hashlib import md5 from typing import ( TYPE_CHECKING, Any, @@ -21,7 +21,6 @@ import typing_inspect from datahub.configuration.source_common import DEFAULT_ENV as DEFAULT_ENV_CONFIGURATION -from datahub.emitter.serialization_helper import pre_json_transform from datahub.metadata.schema_classes import ( AssertionKeyClass, AuditStampClass, @@ -159,11 +158,24 @@ def container_urn_to_key(guid: str) -> Optional[ContainerKeyClass]: return None +class _DatahubKeyJSONEncoder(json.JSONEncoder): + # overload method default + def default(self, obj: Any) -> Any: + if hasattr(obj, "guid"): + return obj.guid() + # Call the default method for other types + return json.JSONEncoder.default(self, obj) + + def datahub_guid(obj: dict) -> str: - obj_str = json.dumps( - pre_json_transform(obj), separators=(",", ":"), sort_keys=True - ).encode("utf-8") - return md5(obj_str).hexdigest() + json_key = json.dumps( + obj, + separators=(",", ":"), + sort_keys=True, + cls=_DatahubKeyJSONEncoder, + ) + md5_hash = hashlib.md5(json_key.encode("utf-8")) + return str(md5_hash.hexdigest()) def make_assertion_urn(assertion_id: str) -> str: diff --git a/metadata-ingestion/src/datahub/emitter/mcp_builder.py b/metadata-ingestion/src/datahub/emitter/mcp_builder.py index 844a29f1c78a35..d50feba8b119c8 100644 --- a/metadata-ingestion/src/datahub/emitter/mcp_builder.py +++ b/metadata-ingestion/src/datahub/emitter/mcp_builder.py @@ -1,14 +1,15 @@ -import hashlib -import json -from typing import Any, Dict, Iterable, List, Optional, TypeVar +from typing import Dict, Iterable, List, Optional, Type, TypeVar from pydantic.fields import Field from pydantic.main import BaseModel from datahub.emitter.mce_builder 
import ( + Aspect, + datahub_guid, make_container_urn, make_data_platform_urn, make_dataplatform_instance_urn, + make_dataset_urn_with_platform_instance, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -18,6 +19,7 @@ ) from datahub.metadata.com.linkedin.pegasus2avro.container import ContainerProperties from datahub.metadata.schema_classes import ( + KEY_ASPECTS, ContainerClass, DomainsClass, EmbedClass, @@ -32,24 +34,13 @@ ) -def _stable_guid_from_dict(d: dict) -> str: - json_key = json.dumps( - d, - separators=(",", ":"), - sort_keys=True, - cls=DatahubKeyJSONEncoder, - ) - md5_hash = hashlib.md5(json_key.encode("utf-8")) - return str(md5_hash.hexdigest()) - - class DatahubKey(BaseModel): def guid_dict(self) -> Dict[str, str]: return self.dict(by_alias=True, exclude_none=True) def guid(self) -> str: bag = self.guid_dict() - return _stable_guid_from_dict(bag) + return datahub_guid(bag) class ContainerKey(DatahubKey): @@ -105,7 +96,15 @@ class MetastoreKey(ContainerKey): metastore: str -class CatalogKey(MetastoreKey): +class CatalogKeyWithMetastore(MetastoreKey): + catalog: str + + +class UnitySchemaKeyWithMetastore(CatalogKeyWithMetastore): + unity_schema: str + + +class CatalogKey(ContainerKey): catalog: str @@ -125,13 +124,15 @@ class BucketKey(ContainerKey): bucket_name: str -class DatahubKeyJSONEncoder(json.JSONEncoder): - # overload method default - def default(self, obj: Any) -> Any: - if hasattr(obj, "guid"): - return obj.guid() - # Call the default method for other types - return json.JSONEncoder.default(self, obj) +class NotebookKey(DatahubKey): + notebook_id: int + platform: str + instance: Optional[str] + + def as_urn(self) -> str: + return make_dataset_urn_with_platform_instance( + platform=self.platform, platform_instance=self.instance, name=self.guid() + ) KeyType = TypeVar("KeyType", bound=ContainerKey) @@ -307,3 +308,12 @@ def create_embed_mcp(urn: str, embed_url: str) -> MetadataChangeProposalWrapper: entityUrn=urn, aspect=EmbedClass(renderUrl=embed_url), ) + + +def entity_supports_aspect(entity_type: str, aspect_type: Type[Aspect]) -> bool: + entity_key_aspect = KEY_ASPECTS[entity_type] + aspect_name = aspect_type.get_aspect_name() + + supported_aspects = entity_key_aspect.ASPECT_INFO["entityAspects"] + + return aspect_name in supported_aspects diff --git a/metadata-ingestion/src/datahub/emitter/rest_emitter.py b/metadata-ingestion/src/datahub/emitter/rest_emitter.py index 937e0902d6d8c7..afb19df9791af3 100644 --- a/metadata-ingestion/src/datahub/emitter/rest_emitter.py +++ b/metadata-ingestion/src/datahub/emitter/rest_emitter.py @@ -4,7 +4,7 @@ import logging import os from json.decoder import JSONDecodeError -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union import requests from deprecated import deprecated @@ -13,6 +13,7 @@ from datahub.cli.cli_utils import get_system_auth from datahub.configuration.common import ConfigurationError, OperationalError +from datahub.emitter.generic_emitter import Emitter from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.request_helper import make_curl_command from datahub.emitter.serialization_helper import pre_json_transform @@ -23,6 +24,9 @@ ) from datahub.metadata.com.linkedin.pegasus2avro.usage import UsageAggregation +if TYPE_CHECKING: + from datahub.ingestion.graph.client import DataHubGraph + logger = 
logging.getLogger(__name__) _DEFAULT_CONNECT_TIMEOUT_SEC = 30 # 30 seconds should be plenty to connect @@ -42,7 +46,7 @@ ) -class DataHubRestEmitter(Closeable): +class DataHubRestEmitter(Closeable, Emitter): _gms_server: str _token: Optional[str] _session: requests.Session @@ -190,6 +194,11 @@ def test_connection(self) -> dict: message += "\nPlease check your configuration and make sure you are talking to the DataHub GMS (usually :8080) or Frontend GMS API (usually :9002/api/gms)." raise ConfigurationError(message) + def to_graph(self) -> "DataHubGraph": + from datahub.ingestion.graph.client import DataHubGraph + + return DataHubGraph.from_emitter(self) + def emit( self, item: Union[ @@ -198,9 +207,6 @@ def emit( MetadataChangeProposalWrapper, UsageAggregation, ], - # NOTE: This signature should have the exception be optional rather than - # required. However, this would be a breaking change that may need - # more careful consideration. callback: Optional[Callable[[Exception, str], None]] = None, ) -> Tuple[datetime.datetime, datetime.datetime]: start_time = datetime.datetime.now() diff --git a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py index 071d590f270f8b..cedaa4fbbd7f6f 100644 --- a/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py +++ b/metadata-ingestion/src/datahub/emitter/sql_parsing_builder.py @@ -106,6 +106,7 @@ def process_sql_parsing_result( user: Optional[UserUrn] = None, custom_operation_type: Optional[str] = None, include_urns: Optional[Set[DatasetUrn]] = None, + include_column_lineage: bool = True, ) -> Iterable[MetadataWorkUnit]: """Process a single query and yield any generated workunits. @@ -130,7 +131,9 @@ def process_sql_parsing_result( _merge_lineage_data( downstream_urn=downstream_urn, upstream_urns=result.in_tables, - column_lineage=result.column_lineage, + column_lineage=result.column_lineage + if include_column_lineage + else None, upstream_edges=self._lineage_map[downstream_urn], query_timestamp=query_timestamp, is_view_ddl=is_view_ddl, @@ -179,15 +182,16 @@ def add_lineage( def gen_workunits(self) -> Iterable[MetadataWorkUnit]: if self.generate_lineage: - yield from self._gen_lineage_workunits() + for mcp in self._gen_lineage_mcps(): + yield mcp.as_workunit() if self.generate_usage_statistics: yield from self._gen_usage_statistics_workunits() - def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]: + def _gen_lineage_mcps(self) -> Iterable[MetadataChangeProposalWrapper]: for downstream_urn in self._lineage_map: upstreams: List[UpstreamClass] = [] fine_upstreams: List[FineGrainedLineageClass] = [] - for upstream_urn, edge in self._lineage_map[downstream_urn].items(): + for edge in self._lineage_map[downstream_urn].values(): upstreams.append(edge.gen_upstream_aspect()) fine_upstreams.extend(edge.gen_fine_grained_lineage_aspects()) @@ -201,7 +205,7 @@ def _gen_lineage_workunits(self) -> Iterable[MetadataWorkUnit]: ) yield MetadataChangeProposalWrapper( entityUrn=downstream_urn, aspect=upstream_lineage - ).as_workunit() + ) def _gen_usage_statistics_workunits(self) -> Iterable[MetadataWorkUnit]: yield from self._usage_aggregator.generate_workunits( diff --git a/metadata-ingestion/src/datahub/emitter/synchronized_file_emitter.py b/metadata-ingestion/src/datahub/emitter/synchronized_file_emitter.py new file mode 100644 index 00000000000000..f82882f1a87cc3 --- /dev/null +++ b/metadata-ingestion/src/datahub/emitter/synchronized_file_emitter.py @@ -0,0 +1,60 @@ +import 
logging +import pathlib +from typing import Callable, Optional, Union + +import filelock + +from datahub.emitter.generic_emitter import Emitter +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.closeable import Closeable +from datahub.ingestion.sink.file import write_metadata_file +from datahub.ingestion.source.file import read_metadata_file +from datahub.metadata.com.linkedin.pegasus2avro.mxe import ( + MetadataChangeEvent, + MetadataChangeProposal, +) + +logger = logging.getLogger(__name__) + + +class SynchronizedFileEmitter(Closeable, Emitter): + """ + A multiprocessing-safe emitter that writes to a file. + + This emitter is intended for testing purposes only. It is not performant + because it reads and writes the full file on every emit call to ensure + that the file is always valid JSON. + """ + + def __init__(self, filename: str) -> None: + self._filename = pathlib.Path(filename) + self._lock = filelock.FileLock(self._filename.with_suffix(".lock")) + + def emit( + self, + item: Union[ + MetadataChangeEvent, MetadataChangeProposal, MetadataChangeProposalWrapper + ], + callback: Optional[Callable[[Exception, str], None]] = None, + ) -> None: + with self._lock: + if self._filename.exists(): + metadata = list(read_metadata_file(self._filename)) + else: + metadata = [] + + logger.debug("Emitting metadata: %s", item) + metadata.append(item) + + write_metadata_file(self._filename, metadata) + + def __repr__(self) -> str: + return f"SynchronizedFileEmitter('{self._filename}')" + + def flush(self) -> None: + # No-op. + pass + + def close(self) -> None: + # No-op. + pass diff --git a/metadata-ingestion/src/datahub/entrypoints.py b/metadata-ingestion/src/datahub/entrypoints.py index 84615fd9a6148a..5bfab3b841fa38 100644 --- a/metadata-ingestion/src/datahub/entrypoints.py +++ b/metadata-ingestion/src/datahub/entrypoints.py @@ -21,6 +21,7 @@ from datahub.cli.ingest_cli import ingest from datahub.cli.migrate import migrate from datahub.cli.put_cli import put +from datahub.cli.specific.datacontract_cli import datacontract from datahub.cli.specific.dataproduct_cli import dataproduct from datahub.cli.specific.group_cli import group from datahub.cli.specific.user_cli import user @@ -158,6 +159,7 @@ def init() -> None: datahub.add_command(user) datahub.add_command(group) datahub.add_command(dataproduct) +datahub.add_command(datacontract) try: from datahub.cli.lite_cli import lite diff --git a/metadata-ingestion/src/datahub/ingestion/api/closeable.py b/metadata-ingestion/src/datahub/ingestion/api/closeable.py index 523174b9978b3c..80a5008ed63683 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/closeable.py +++ b/metadata-ingestion/src/datahub/ingestion/api/closeable.py @@ -1,7 +1,9 @@ from abc import abstractmethod from contextlib import AbstractContextManager from types import TracebackType -from typing import Optional, Type +from typing import Optional, Type, TypeVar + +_Self = TypeVar("_Self", bound="Closeable") class Closeable(AbstractContextManager): @@ -9,6 +11,10 @@ class Closeable(AbstractContextManager): def close(self) -> None: pass + def __enter__(self: _Self) -> _Self: + # This method is mainly required for type checking. 
+ return self + def __exit__( self, exc_type: Optional[Type[BaseException]], diff --git a/metadata-ingestion/src/datahub/ingestion/api/common.py b/metadata-ingestion/src/datahub/ingestion/api/common.py index 778bd119615e27..a6761a3c77d5e8 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/common.py +++ b/metadata-ingestion/src/datahub/ingestion/api/common.py @@ -2,6 +2,7 @@ from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, Generic, Iterable, Optional, Tuple, TypeVar +from datahub.configuration.common import ConfigurationError from datahub.emitter.mce_builder import set_dataset_urn_to_lower from datahub.ingestion.api.committable import Committable from datahub.ingestion.graph.client import DataHubGraph @@ -75,3 +76,11 @@ def register_checkpointer(self, committable: Committable) -> None: def get_committables(self) -> Iterable[Tuple[str, Committable]]: yield from self.checkpointers.items() + + def require_graph(self, operation: Optional[str] = None) -> DataHubGraph: + if not self.graph: + raise ConfigurationError( + f"{operation or 'This operation'} requires a graph, but none was provided. " + "To provide one, either use the datahub-rest sink or set the top-level datahub_api config in the recipe." + ) + return self.graph diff --git a/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py new file mode 100644 index 00000000000000..945b201ca5758c --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/api/incremental_lineage_helper.py @@ -0,0 +1,142 @@ +import copy +from typing import Dict, Iterable, Optional + +from datahub.emitter.mce_builder import datahub_guid, set_aspect +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph +from datahub.metadata.schema_classes import ( + FineGrainedLineageClass, + MetadataChangeEventClass, + SystemMetadataClass, + UpstreamClass, + UpstreamLineageClass, +) +from datahub.specific.dataset import DatasetPatchBuilder + + +def _convert_upstream_lineage_to_patch( + urn: str, + aspect: UpstreamLineageClass, + system_metadata: Optional[SystemMetadataClass], +) -> MetadataWorkUnit: + patch_builder = DatasetPatchBuilder(urn, system_metadata) + for upstream in aspect.upstreams: + patch_builder.add_upstream_lineage(upstream) + mcp = next(iter(patch_builder.build())) + return MetadataWorkUnit(id=f"{urn}-upstreamLineage", mcp_raw=mcp) + + +def get_fine_grained_lineage_key(fine_upstream: FineGrainedLineageClass) -> str: + return datahub_guid( + { + "upstreams": sorted(fine_upstream.upstreams or []), + "downstreams": sorted(fine_upstream.downstreams or []), + "transformOperation": fine_upstream.transformOperation, + } + ) + + +def _merge_upstream_lineage( + new_aspect: UpstreamLineageClass, gms_aspect: UpstreamLineageClass +) -> UpstreamLineageClass: + merged_aspect = copy.deepcopy(gms_aspect) + + upstreams_map: Dict[str, UpstreamClass] = { + upstream.dataset: upstream for upstream in merged_aspect.upstreams + } + + upstreams_updated = False + fine_upstreams_updated = False + + for table_upstream in new_aspect.upstreams: + if table_upstream.dataset not in upstreams_map or ( + table_upstream.auditStamp.time + > upstreams_map[table_upstream.dataset].auditStamp.time + ): + upstreams_map[table_upstream.dataset] = table_upstream + upstreams_updated = True + + if upstreams_updated: + merged_aspect.upstreams = 
list(upstreams_map.values()) + + if new_aspect.fineGrainedLineages and merged_aspect.fineGrainedLineages: + fine_upstreams_map: Dict[str, FineGrainedLineageClass] = { + get_fine_grained_lineage_key(fine_upstream): fine_upstream + for fine_upstream in merged_aspect.fineGrainedLineages + } + for column_upstream in new_aspect.fineGrainedLineages: + column_upstream_key = get_fine_grained_lineage_key(column_upstream) + + if column_upstream_key not in fine_upstreams_map or ( + column_upstream.confidenceScore + > fine_upstreams_map[column_upstream_key].confidenceScore + ): + fine_upstreams_map[column_upstream_key] = column_upstream + fine_upstreams_updated = True + + if fine_upstreams_updated: + merged_aspect.fineGrainedLineages = list(fine_upstreams_map.values()) + else: + merged_aspect.fineGrainedLineages = ( + new_aspect.fineGrainedLineages or gms_aspect.fineGrainedLineages + ) + + return merged_aspect + + +def _lineage_wu_via_read_modify_write( + graph: Optional[DataHubGraph], + urn: str, + aspect: UpstreamLineageClass, + system_metadata: Optional[SystemMetadataClass], +) -> MetadataWorkUnit: + if graph is None: + raise ValueError( + "Failed to handle incremental lineage, DataHubGraph is missing. " + "Use `datahub-rest` sink OR provide `datahub-api` config in recipe. " + ) + gms_aspect = graph.get_aspect(urn, UpstreamLineageClass) + if gms_aspect: + new_aspect = _merge_upstream_lineage(aspect, gms_aspect) + else: + new_aspect = aspect + + return MetadataChangeProposalWrapper( + entityUrn=urn, aspect=new_aspect, systemMetadata=system_metadata + ).as_workunit() + + +def auto_incremental_lineage( + graph: Optional[DataHubGraph], + incremental_lineage: bool, + stream: Iterable[MetadataWorkUnit], +) -> Iterable[MetadataWorkUnit]: + if not incremental_lineage: + yield from stream + return # early exit + + for wu in stream: + lineage_aspect: Optional[UpstreamLineageClass] = wu.get_aspect_of_type( + UpstreamLineageClass + ) + urn = wu.get_urn() + + if lineage_aspect: + if isinstance(wu.metadata, MetadataChangeEventClass): + set_aspect( + wu.metadata, None, UpstreamLineageClass + ) # we'll emit upstreamLineage separately below + if len(wu.metadata.proposedSnapshot.aspects) > 0: + yield wu + + if lineage_aspect.fineGrainedLineages: + yield _lineage_wu_via_read_modify_write( + graph, urn, lineage_aspect, wu.metadata.systemMetadata + ) + elif lineage_aspect.upstreams: + yield _convert_upstream_lineage_to_patch( + urn, lineage_aspect, wu.metadata.systemMetadata + ) + else: + yield wu diff --git a/metadata-ingestion/src/datahub/ingestion/api/source.py b/metadata-ingestion/src/datahub/ingestion/api/source.py index 0bcc220cad49bf..8940642f7008a7 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source.py @@ -29,6 +29,7 @@ from datahub.ingestion.api.report import Report from datahub.ingestion.api.source_helpers import ( auto_browse_path_v2, + auto_lowercase_urns, auto_materialize_referenced_tags, auto_status_aspect, auto_workunit_reporter, @@ -192,7 +193,31 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: self.ctx.pipeline_config.flags.generate_browse_path_v2_dry_run ) + auto_lowercase_dataset_urns: Optional[MetadataWorkUnitProcessor] = None + if ( + self.ctx.pipeline_config + and self.ctx.pipeline_config.source + and self.ctx.pipeline_config.source.config + and ( + ( + hasattr( + self.ctx.pipeline_config.source.config, + "convert_urns_to_lowercase", + ) + and 
self.ctx.pipeline_config.source.config.convert_urns_to_lowercase + ) + or ( + hasattr(self.ctx.pipeline_config.source.config, "get") + and self.ctx.pipeline_config.source.config.get( + "convert_urns_to_lowercase" + ) + ) + ) + ): + auto_lowercase_dataset_urns = auto_lowercase_urns + return [ + auto_lowercase_dataset_urns, auto_status_aspect, auto_materialize_referenced_tags, browse_path_processor, diff --git a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py index 7fc15cf829678b..2ce9e07bc57bc8 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py +++ b/metadata-ingestion/src/datahub/ingestion/api/source_helpers.py @@ -35,7 +35,7 @@ from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.tag_urn import TagUrn from datahub.utilities.urns.urn import guess_entity_type -from datahub.utilities.urns.urn_iter import list_urns +from datahub.utilities.urns.urn_iter import list_urns, lowercase_dataset_urns if TYPE_CHECKING: from datahub.ingestion.api.source import SourceReport @@ -70,7 +70,6 @@ def auto_status_aspect( for wu in stream: urn = wu.get_urn() all_urns.add(urn) - if not wu.is_primary_source: # If this is a non-primary source, we pretend like we've seen the status # aspect so that we don't try to emit a removal for it. @@ -173,6 +172,23 @@ def auto_materialize_referenced_tags( ).as_workunit() +def auto_lowercase_urns( + stream: Iterable[MetadataWorkUnit], +) -> Iterable[MetadataWorkUnit]: + """Lowercase all dataset urns""" + + for wu in stream: + try: + old_urn = wu.get_urn() + lowercase_dataset_urns(wu.metadata) + wu.id = wu.id.replace(old_urn, wu.get_urn()) + + yield wu + except Exception as e: + logger.warning(f"Failed to lowercase urns for {wu}: {e}", exc_info=True) + yield wu + + def auto_browse_path_v2( stream: Iterable[MetadataWorkUnit], *, diff --git a/metadata-ingestion/src/datahub/ingestion/api/workunit.py b/metadata-ingestion/src/datahub/ingestion/api/workunit.py index 8eea3514a22afa..b1c003ee27e125 100644 --- a/metadata-ingestion/src/datahub/ingestion/api/workunit.py +++ b/metadata-ingestion/src/datahub/ingestion/api/workunit.py @@ -22,7 +22,11 @@ class MetadataWorkUnit(WorkUnit): metadata: Union[ MetadataChangeEvent, MetadataChangeProposal, MetadataChangeProposalWrapper ] - # A workunit creator can determine if this workunit is allowed to fail + + # A workunit creator can determine if this workunit is allowed to fail. + # TODO: This flag was initially added during the rollout of the subType aspect + # to improve backwards compatibility, but is not really needed anymore and so + # should be removed. 
treat_errors_as_warnings: bool = False # When this is set to false, this MWU will be ignored by automatic helpers diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py index c943b83a887edb..360ddf1129154b 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/json_schema_util.py @@ -598,7 +598,8 @@ def get_fields_from_schema( jsonref_schema_dict = schema_dict else: # first validate the schema using a json validator - jsonschema.Draft7Validator.check_schema(schema_dict) + validator = jsonschema.validators.validator_for(schema_dict) + validator.check_schema(schema_dict) # then apply jsonref jsonref_schema_dict = jsonref.loads(schema_string) except Exception as e: diff --git a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py index 4acf99a50e50ed..df0b732833fbe1 100644 --- a/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py +++ b/metadata-ingestion/src/datahub/ingestion/extractor/schema_util.py @@ -1,6 +1,18 @@ import json import logging -from typing import Any, Callable, Dict, Generator, List, Optional, Type, Union +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Mapping, + Optional, + Type, + Union, + cast, + overload, +) import avro.schema @@ -54,6 +66,8 @@ avro.schema.PrimitiveSchema, ] +SchemaOrField = Union[avro.schema.Schema, avro.schema.Field] + FieldStack = List[avro.schema.Field] # The latest avro code contains this type definition in a compatibility module, @@ -124,16 +138,22 @@ def __init__( self._meta_mapping_processor = meta_mapping_processor self._schema_tags_field = schema_tags_field self._tag_prefix = tag_prefix + # Map of avro schema type to the conversion handler - self._avro_type_to_mce_converter_map: Dict[ - avro.schema.Schema, - Callable[[ExtendedAvroNestedSchemas], Generator[SchemaField, None, None]], + # TODO: Clean up this type... 
perhaps refactor + self._avro_type_to_mce_converter_map: Mapping[ + Union[ + Type[avro.schema.Schema], + Type[avro.schema.Field], + Type[avro.schema.LogicalSchema], + ], + Callable[[SchemaOrField], Iterable[SchemaField]], ] = { avro.schema.RecordSchema: self._gen_from_non_field_nested_schemas, avro.schema.UnionSchema: self._gen_from_non_field_nested_schemas, avro.schema.ArraySchema: self._gen_from_non_field_nested_schemas, avro.schema.MapSchema: self._gen_from_non_field_nested_schemas, - avro.schema.Field: self._gen_nested_schema_from_field, + avro.schema.Field: self._gen_nested_schema_from_field, # type: ignore avro.schema.PrimitiveSchema: self._gen_non_nested_to_mce_fields, avro.schema.FixedSchema: self._gen_non_nested_to_mce_fields, avro.schema.EnumSchema: self._gen_non_nested_to_mce_fields, @@ -142,20 +162,22 @@ def __init__( @staticmethod def _get_type_name( - avro_schema: avro.schema.Schema, logical_if_present: bool = False + avro_schema: SchemaOrField, logical_if_present: bool = False ) -> str: logical_type_name: Optional[str] = None if logical_if_present: - logical_type_name = getattr( - avro_schema, "logical_type", None - ) or avro_schema.props.get("logicalType") + logical_type_name = cast( + Optional[str], + getattr(avro_schema, "logical_type", None) + or avro_schema.props.get("logicalType"), + ) return logical_type_name or str( getattr(avro_schema.type, "type", avro_schema.type) ) @staticmethod def _get_column_type( - avro_schema: avro.schema.Schema, logical_type: Optional[str] + avro_schema: SchemaOrField, logical_type: Optional[str] ) -> SchemaFieldDataType: type_name: str = AvroToMceSchemaConverter._get_type_name(avro_schema) TypeClass: Optional[Type] = AvroToMceSchemaConverter.field_type_mapping.get( @@ -186,7 +208,7 @@ def _get_column_type( ) return dt - def _is_nullable(self, schema: avro.schema.Schema) -> bool: + def _is_nullable(self, schema: SchemaOrField) -> bool: if isinstance(schema, avro.schema.Field): return self._is_nullable(schema.type) if isinstance(schema, avro.schema.UnionSchema): @@ -208,7 +230,7 @@ def _strip_namespace(name_or_fullname: str) -> str: return name_or_fullname.rsplit(".", maxsplit=1)[-1] @staticmethod - def _get_simple_native_type(schema: ExtendedAvroNestedSchemas) -> str: + def _get_simple_native_type(schema: SchemaOrField) -> str: if isinstance(schema, (avro.schema.RecordSchema, avro.schema.Field)): # For Records, fields, always return the name. return AvroToMceSchemaConverter._strip_namespace(schema.name) @@ -226,7 +248,7 @@ def _get_simple_native_type(schema: ExtendedAvroNestedSchemas) -> str: return schema.type @staticmethod - def _get_type_annotation(schema: ExtendedAvroNestedSchemas) -> str: + def _get_type_annotation(schema: SchemaOrField) -> str: simple_native_type = AvroToMceSchemaConverter._get_simple_native_type(schema) if simple_native_type.startswith("__struct_"): simple_native_type = "struct" @@ -237,10 +259,24 @@ def _get_type_annotation(schema: ExtendedAvroNestedSchemas) -> str: else: return f"[type={simple_native_type}]" + @staticmethod + @overload + def _get_underlying_type_if_option_as_union( + schema: SchemaOrField, default: SchemaOrField + ) -> SchemaOrField: + ... + + @staticmethod + @overload + def _get_underlying_type_if_option_as_union( + schema: SchemaOrField, default: Optional[SchemaOrField] = None + ) -> Optional[SchemaOrField]: + ... 
+ @staticmethod def _get_underlying_type_if_option_as_union( - schema: AvroNestedSchemas, default: Optional[AvroNestedSchemas] = None - ) -> AvroNestedSchemas: + schema: SchemaOrField, default: Optional[SchemaOrField] = None + ) -> Optional[SchemaOrField]: if isinstance(schema, avro.schema.UnionSchema) and len(schema.schemas) == 2: (first, second) = schema.schemas if first.type == AVRO_TYPE_NULL: @@ -258,8 +294,8 @@ class SchemaFieldEmissionContextManager: def __init__( self, - schema: avro.schema.Schema, - actual_schema: avro.schema.Schema, + schema: SchemaOrField, + actual_schema: SchemaOrField, converter: "AvroToMceSchemaConverter", description: Optional[str] = None, default_value: Optional[str] = None, @@ -275,7 +311,7 @@ def __enter__(self): self._converter._prefix_name_stack.append(type_annotation) return self - def emit(self) -> Generator[SchemaField, None, None]: + def emit(self) -> Iterable[SchemaField]: if ( not isinstance( self._actual_schema, @@ -307,7 +343,7 @@ def emit(self) -> Generator[SchemaField, None, None]: description = self._description if not description and actual_schema.props.get("doc"): - description = actual_schema.props.get("doc") + description = cast(Optional[str], actual_schema.props.get("doc")) if self._default_value is not None: description = f"{description if description else ''}\nField default value: {self._default_value}" @@ -320,12 +356,12 @@ def emit(self) -> Generator[SchemaField, None, None]: native_data_type = native_data_type[ slice(len(type_prefix), len(native_data_type) - 1) ] - native_data_type = actual_schema.props.get( - "native_data_type", native_data_type + native_data_type = cast( + str, actual_schema.props.get("native_data_type", native_data_type) ) field_path = self._converter._get_cur_field_path() - merged_props = {} + merged_props: Dict[str, Any] = {} merged_props.update(self._schema.other_props) merged_props.update(schema.other_props) @@ -363,12 +399,13 @@ def emit(self) -> Generator[SchemaField, None, None]: meta_terms_aspect = meta_aspects.get(Constants.ADD_TERM_OPERATION) - logical_type_name: Optional[str] = ( + logical_type_name: Optional[str] = cast( + Optional[str], # logicalType nested inside type getattr(actual_schema, "logical_type", None) or actual_schema.props.get("logicalType") # bare logicalType - or self._actual_schema.props.get("logicalType") + or self._actual_schema.props.get("logicalType"), ) field = SchemaField( @@ -392,14 +429,12 @@ def emit(self) -> Generator[SchemaField, None, None]: def __exit__(self, exc_type, exc_val, exc_tb): self._converter._prefix_name_stack.pop() - def _get_sub_schemas( - self, schema: ExtendedAvroNestedSchemas - ) -> Generator[avro.schema.Schema, None, None]: + def _get_sub_schemas(self, schema: SchemaOrField) -> Iterable[SchemaOrField]: """Responsible for generation for appropriate sub-schemas for every nested AVRO type.""" def gen_items_from_list_tuple_or_scalar( val: Any, - ) -> Generator[avro.schema.Schema, None, None]: + ) -> Iterable[avro.schema.Schema]: if isinstance(val, (list, tuple)): for i in val: yield i @@ -433,7 +468,7 @@ def gen_items_from_list_tuple_or_scalar( def _gen_nested_schema_from_field( self, field: avro.schema.Field, - ) -> Generator[SchemaField, None, None]: + ) -> Iterable[SchemaField]: """Handles generation of MCE SchemaFields for an AVRO Field type.""" # NOTE: Here we only manage the field stack and trigger MCE Field generation from this field's type. 
# The actual emitting of a field happens when @@ -447,7 +482,7 @@ def _gen_nested_schema_from_field( def _gen_from_last_field( self, schema_to_recurse: Optional[AvroNestedSchemas] = None - ) -> Generator[SchemaField, None, None]: + ) -> Iterable[SchemaField]: """Emits the field most-recent field, optionally triggering sub-schema generation under the field.""" last_field_schema = self._fields_stack[-1] # Generate the custom-description for the field. @@ -467,8 +502,8 @@ def _gen_from_last_field( yield from self._to_mce_fields(sub_schema) def _gen_from_non_field_nested_schemas( - self, schema: AvroNestedSchemas - ) -> Generator[SchemaField, None, None]: + self, schema: SchemaOrField + ) -> Iterable[SchemaField]: """Handles generation of MCE SchemaFields for all standard AVRO nested types.""" # Handle recursive record definitions recurse: bool = True @@ -511,8 +546,8 @@ def _gen_from_non_field_nested_schemas( yield from self._to_mce_fields(sub_schema) def _gen_non_nested_to_mce_fields( - self, schema: AvroNonNestedSchemas - ) -> Generator[SchemaField, None, None]: + self, schema: SchemaOrField + ) -> Iterable[SchemaField]: """Handles generation of MCE SchemaFields for non-nested AVRO types.""" with AvroToMceSchemaConverter.SchemaFieldEmissionContextManager( schema, @@ -521,9 +556,7 @@ def _gen_non_nested_to_mce_fields( ) as non_nested_emitter: yield from non_nested_emitter.emit() - def _to_mce_fields( - self, avro_schema: avro.schema.Schema - ) -> Generator[SchemaField, None, None]: + def _to_mce_fields(self, avro_schema: SchemaOrField) -> Iterable[SchemaField]: # Invoke the relevant conversion handler for the schema element type. schema_type = ( type(avro_schema) @@ -541,7 +574,7 @@ def to_mce_fields( meta_mapping_processor: Optional[OperationProcessor] = None, schema_tags_field: Optional[str] = None, tag_prefix: Optional[str] = None, - ) -> Generator[SchemaField, None, None]: + ) -> Iterable[SchemaField]: """ Converts a key or value type AVRO schema string to appropriate MCE SchemaFields. :param avro_schema_string: String representation of the AVRO schema. diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index 673ada4f730519..ccff677c3a4716 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -7,7 +7,7 @@ from dataclasses import dataclass from datetime import datetime from json.decoder import JSONDecodeError -from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Set, Tuple, Type +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type from avro.schema import RecordSchema from deprecated import deprecated @@ -138,6 +138,23 @@ def __init__(self, config: DatahubClientConfig) -> None: self.server_id = "missing" logger.debug(f"Failed to get server id due to {e}") + @classmethod + def from_emitter(cls, emitter: DatahubRestEmitter) -> "DataHubGraph": + return cls( + DatahubClientConfig( + server=emitter._gms_server, + token=emitter._token, + timeout_sec=emitter._read_timeout_sec, + retry_status_codes=emitter._retry_status_codes, + retry_max_times=emitter._retry_max_times, + extra_headers=emitter._session.headers, + disable_ssl_verification=emitter._session.verify is False, + # TODO: Support these headers. 
+ # ca_certificate_path=emitter._ca_certificate_path, + # client_certificate_path=emitter._client_certificate_path, + ) + ) + def _send_restli_request(self, method: str, url: str, **kwargs: Any) -> Dict: try: response = self._session.request(method, url, **kwargs) @@ -993,14 +1010,13 @@ def _make_schema_resolver( def initialize_schema_resolver_from_datahub( self, platform: str, platform_instance: Optional[str], env: str - ) -> Tuple["SchemaResolver", Set[str]]: + ) -> "SchemaResolver": logger.info("Initializing schema resolver") schema_resolver = self._make_schema_resolver( platform, platform_instance, env, include_graph=False ) logger.info(f"Fetching schemas for platform {platform}, env {env}") - urns = [] count = 0 with PerfTimer() as timer: for urn, schema_info in self._bulk_fetch_schema_info_by_filter( @@ -1009,7 +1025,6 @@ def initialize_schema_resolver_from_datahub( env=env, ): try: - urns.append(urn) schema_resolver.add_graphql_schema_metadata(urn, schema_info) count += 1 except Exception: @@ -1024,7 +1039,7 @@ def initialize_schema_resolver_from_datahub( ) logger.info("Finished initializing schema resolver") - return schema_resolver, set(urns) + return schema_resolver def parse_sql_lineage( self, diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py index 79d959965e0dd7..f2735c24ca19dc 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline.py @@ -353,77 +353,97 @@ def _time_to_print(self) -> bool: return False def run(self) -> None: - self.final_status = "unknown" - self._notify_reporters_on_ingestion_start() - callback = None - try: - callback = ( - LoggingCallback() - if not self.config.failure_log.enabled - else DeadLetterQueueCallback( - self.ctx, self.config.failure_log.log_config - ) - ) - for wu in itertools.islice( - self.source.get_workunits(), - self.preview_workunits if self.preview_mode else None, - ): - try: - if self._time_to_print(): - self.pretty_print_summary(currently_running=True) - except Exception as e: - logger.warning(f"Failed to print summary {e}") - - if not self.dry_run: - self.sink.handle_work_unit_start(wu) - try: - record_envelopes = self.extractor.get_records(wu) - for record_envelope in self.transform(record_envelopes): - if not self.dry_run: - self.sink.write_record_async(record_envelope, callback) - - except RuntimeError: - raise - except SystemExit: - raise - except Exception as e: - logger.error( - "Failed to process some records. Continuing.", exc_info=e + with contextlib.ExitStack() as stack: + if self.config.flags.generate_memory_profiles: + import memray + + stack.enter_context( + memray.Tracker( + f"{self.config.flags.generate_memory_profiles}/{self.config.run_id}.bin" ) - # TODO: Transformer errors should cause the pipeline to fail. 
- - self.extractor.close() - if not self.dry_run: - self.sink.handle_work_unit_end(wu) - self.source.close() - # no more data is coming, we need to let the transformers produce any additional records if they are holding on to state - for record_envelope in self.transform( - [ - RecordEnvelope( - record=EndOfStream(), metadata={"workunit_id": "end-of-stream"} + ) + + self.final_status = "unknown" + self._notify_reporters_on_ingestion_start() + callback = None + try: + callback = ( + LoggingCallback() + if not self.config.failure_log.enabled + else DeadLetterQueueCallback( + self.ctx, self.config.failure_log.log_config ) - ] - ): - if not self.dry_run and not isinstance( - record_envelope.record, EndOfStream + ) + for wu in itertools.islice( + self.source.get_workunits(), + self.preview_workunits if self.preview_mode else None, + ): + try: + if self._time_to_print(): + self.pretty_print_summary(currently_running=True) + except Exception as e: + logger.warning(f"Failed to print summary {e}") + + if not self.dry_run: + self.sink.handle_work_unit_start(wu) + try: + record_envelopes = self.extractor.get_records(wu) + for record_envelope in self.transform(record_envelopes): + if not self.dry_run: + try: + self.sink.write_record_async( + record_envelope, callback + ) + except Exception as e: + # In case the sink's error handling is bad, we still want to report the error. + self.sink.report.report_failure( + f"Failed to write record: {e}" + ) + + except RuntimeError: + raise + except SystemExit: + raise + except Exception as e: + logger.error( + "Failed to process some records. Continuing.", + exc_info=e, + ) + # TODO: Transformer errors should cause the pipeline to fail. + + self.extractor.close() + if not self.dry_run: + self.sink.handle_work_unit_end(wu) + self.source.close() + # no more data is coming, we need to let the transformers produce any additional records if they are holding on to state + for record_envelope in self.transform( + [ + RecordEnvelope( + record=EndOfStream(), + metadata={"workunit_id": "end-of-stream"}, + ) + ] ): - # TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc. - self.sink.write_record_async(record_envelope, callback) - - self.sink.close() - self.process_commits() - self.final_status = "completed" - except (SystemExit, RuntimeError, KeyboardInterrupt) as e: - self.final_status = "cancelled" - logger.error("Caught error", exc_info=e) - raise - finally: - clear_global_warnings() - - if callback and hasattr(callback, "close"): - callback.close() # type: ignore - - self._notify_reporters_on_ingestion_completion() + if not self.dry_run and not isinstance( + record_envelope.record, EndOfStream + ): + # TODO: propagate EndOfStream and other control events to sinks, to allow them to flush etc. 
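The end-of-stream pass above exists so that stateful transformers can flush held-back records once no more data is coming. A rough sketch of that contract (import paths assumed from this module's own usage; real transformers subclass DataHub's Transformer API rather than using a bare generator):

```python
# Sketch only: a generator-style transform that buffers records and flushes
# them when the EndOfStream control record arrives.
from typing import Iterable, List

from datahub.ingestion.api.common import EndOfStream, RecordEnvelope


def buffering_transform(
    records: Iterable[RecordEnvelope],
) -> Iterable[RecordEnvelope]:
    held: List[RecordEnvelope] = []
    for envelope in records:
        if isinstance(envelope.record, EndOfStream):
            # No more data is coming: emit anything held back, then
            # forward the control record itself.
            yield from held
            held.clear()
            yield envelope
        else:
            held.append(envelope)
```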
+ self.sink.write_record_async(record_envelope, callback) + + self.sink.close() + self.process_commits() + self.final_status = "completed" + except (SystemExit, RuntimeError, KeyboardInterrupt) as e: + self.final_status = "cancelled" + logger.error("Caught error", exc_info=e) + raise + finally: + clear_global_warnings() + + if callback and hasattr(callback, "close"): + callback.close() # type: ignore + + self._notify_reporters_on_ingestion_completion() def transform(self, records: Iterable[RecordEnvelope]) -> Iterable[RecordEnvelope]: """ diff --git a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py index ff9a7a6f3d146b..da3cee8ad9c1b8 100644 --- a/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py +++ b/metadata-ingestion/src/datahub/ingestion/run/pipeline_config.py @@ -57,6 +57,13 @@ class FlagsConfig(ConfigModel): ), ) + generate_memory_profiles: Optional[str] = Field( + default=None, + description=( + "Generate memray memory dumps for ingestion process by providing a path to write the dump file in." + ), + ) + class PipelineConfig(ConfigModel): # Once support for discriminated unions gets merged into Pydantic, we can diff --git a/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py b/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py index 39054c256a7fd5..38ddadaafc862c 100644 --- a/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/sink/datahub_kafka.py @@ -9,7 +9,6 @@ MetadataChangeEvent, MetadataChangeProposal, ) -from datahub.metadata.schema_classes import MetadataChangeProposalClass class KafkaSinkConfig(KafkaEmitterConfig): @@ -58,27 +57,21 @@ def write_record_async( ], write_callback: WriteCallback, ) -> None: - record = record_envelope.record - if isinstance(record, MetadataChangeEvent): - self.emitter.emit_mce_async( + callback = _KafkaCallback( + self.report, record_envelope, write_callback + ).kafka_callback + try: + record = record_envelope.record + self.emitter.emit( record, - callback=_KafkaCallback( - self.report, record_envelope, write_callback - ).kafka_callback, - ) - elif isinstance( - record, (MetadataChangeProposalWrapper, MetadataChangeProposalClass) - ): - self.emitter.emit_mcp_async( - record, - callback=_KafkaCallback( - self.report, record_envelope, write_callback - ).kafka_callback, - ) - else: - raise ValueError( - f"The datahub-kafka sink only supports MetadataChangeEvent/MetadataChangeProposal[Wrapper] classes, not {type(record)}" + callback=callback, ) + except Exception as err: + # In case we throw an exception while trying to emit the record, + # catch it and report the failure. This might happen if the schema + # registry is down or otherwise misconfigured, in which case we'd + # fail when serializing the record. 
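The try/except in the Kafka sink below routes synchronous serialization failures through the same callback used for asynchronous produce errors, so callers observe a single failure path. The shape of that pattern as a standalone sketch (the helper name and signature are hypothetical, not part of the sink):

```python
# Hypothetical helper illustrating the unified error path used by the sink.
from typing import Any, Callable


def emit_with_unified_errors(
    emit: Callable[..., None],
    record: Any,
    callback: Callable[[Exception, str], None],
) -> None:
    try:
        # May raise before anything is enqueued, e.g. if serialization
        # against the schema registry fails.
        emit(record, callback=callback)
    except Exception as err:
        callback(err, f"Failed to write record: {err}")
```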
+ callback(err, f"Failed to write record: {err}") def close(self) -> None: self.emitter.flush() diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index e5dff786b71d15..aa7e5aa352a3e2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -221,6 +221,7 @@ def report_table_dropped(self, table: str) -> None: SourceCapability.DELETION_DETECTION, "Enabled by default when stateful ingestion is turned on.", ) +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") class GlueSource(StatefulIngestionSourceBase): """ Note: if you also have files in S3 that you'd like to ingest, we recommend you use Glue's built-in data catalog. See [here](../../../../docs/generated/ingestion/sources/s3.md) for a quick guide on how to set up a crawler on Glue and ingest the outputs with DataHub. diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py index 501162455cc456..878b8dd1bb9a51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/s3_util.py @@ -34,21 +34,26 @@ def get_bucket_relative_path(s3_uri: str) -> str: return "/".join(strip_s3_prefix(s3_uri).split("/")[1:]) -def make_s3_urn(s3_uri: str, env: str) -> str: +def make_s3_urn(s3_uri: str, env: str, remove_extension: bool = True) -> str: s3_name = strip_s3_prefix(s3_uri) if s3_name.endswith("/"): s3_name = s3_name[:-1] name, extension = os.path.splitext(s3_name) - - if extension != "": + if remove_extension and extension != "": extension = extension[1:] # remove the dot return f"urn:li:dataset:(urn:li:dataPlatform:s3,{name}_{extension},{env})" return f"urn:li:dataset:(urn:li:dataPlatform:s3,{s3_name},{env})" +def make_s3_urn_for_lineage(s3_uri: str, env: str) -> str: + # Ideally this is the implementation for all S3 URNs + # Don't feel comfortable changing `make_s3_urn` for glue, sagemaker, and athena + return make_s3_urn(s3_uri, env, remove_extension=False) + + def get_bucket_name(s3_uri: str) -> str: if not is_s3_uri(s3_uri): raise ValueError( diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index 8a16b1a4a5f6ba..6959a483130106 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -16,7 +16,6 @@ make_dataplatform_instance_urn, make_dataset_urn, make_tag_urn, - set_dataset_urn_to_lower, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import BigQueryDatasetKey, ContainerKey, ProjectIdKey @@ -154,6 +153,7 @@ def cleanup(config: BigQueryV2Config) -> None: ) @capability(SourceCapability.DESCRIPTIONS, "Enabled by default") @capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") +@capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration") @capability( SourceCapability.USAGE_STATS, "Enabled by default, can be disabled via configuration `include_usage_statistics`", @@ -218,8 +218,6 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): if self.config.enable_legacy_sharded_table_support: BigqueryTableIdentifier._BQ_SHARDED_TABLE_SUFFIX = "" - 
set_dataset_urn_to_lower(self.config.convert_urns_to_lowercase) - self.bigquery_data_dictionary = BigQuerySchemaApi( self.report.schema_api_perf, self.config.get_bigquery_client() ) @@ -458,10 +456,11 @@ def _init_schema_resolver(self) -> SchemaResolver: platform=self.platform, platform_instance=self.config.platform_instance, env=self.config.env, - )[0] + ) else: logger.warning( - "Failed to load schema info from DataHub as DataHubGraph is missing.", + "Failed to load schema info from DataHub as DataHubGraph is missing. " + "Use `datahub-rest` sink OR provide `datahub-api` config in recipe. ", ) return SchemaResolver(platform=self.platform, env=self.config.env) @@ -600,9 +599,6 @@ def _process_project( db_views: Dict[str, List[BigqueryView]] = {} project_id = bigquery_project.id - - yield from self.gen_project_id_containers(project_id) - try: bigquery_project.datasets = ( self.bigquery_data_dictionary.get_datasets_for_project_id(project_id) @@ -619,11 +615,23 @@ def _process_project( return None if len(bigquery_project.datasets) == 0: - logger.warning( - f"No dataset found in {project_id}. Either there are no datasets in this project or missing bigquery.datasets.get permission. You can assign predefined roles/bigquery.metadataViewer role to your service account." + more_info = ( + "Either there are no datasets in this project or missing bigquery.datasets.get permission. " + "You can assign predefined roles/bigquery.metadataViewer role to your service account." ) + if self.config.exclude_empty_projects: + self.report.report_dropped(project_id) + warning_message = f"Excluded project '{project_id}' since no datasets were found. {more_info}" + else: + yield from self.gen_project_id_containers(project_id) + warning_message = ( + f"No datasets found in project '{project_id}'. {more_info}" ) + logger.warning(warning_message) return + yield from self.gen_project_id_containers(project_id) + self.report.num_project_datasets_to_scan[project_id] = len( bigquery_project.datasets ) @@ -1042,11 +1050,18 @@ def gen_schema_fields(self, columns: List[BigqueryColumn]) -> List[SchemaField]: for idx, field in enumerate(schema_fields): # Remove all the [version=2.0].[type=struct]. tags to get the field path if ( - re.sub(r"\[.*?\]\.", "", field.fieldPath, 0, re.MULTILINE) - == col.field_path + re.sub( + r"\[.*?\]\.", + "", + field.fieldPath.lower(), + 0, + re.MULTILINE, + ) + == col.field_path.lower() ): field.description = col.comment schema_fields[idx] = field + break else: tags = [] if col.is_partition_column: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py index b0ac77201b415b..55366d6c57cf83 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit.py @@ -3,7 +3,7 @@ import re from dataclasses import dataclass, field from datetime import datetime -from typing import Any, ClassVar, Dict, List, Optional, Pattern, Set, Tuple, Union +from typing import Any, ClassVar, Dict, List, Optional, Pattern, Tuple, Union from dateutil import parser @@ -20,7 +20,13 @@ logger: logging.Logger = logging.getLogger(__name__) -_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX = "((.+)[_$])?(\\d{8})$" +# Regexp for sharded tables. +# A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date. +# The regexp checks for valid dates in the suffix (e.g.
20200101, 20200229, 20201231) and if the date is not valid +# then it is not a sharded table. +_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX = ( + "((.+\\D)[_$]?)?(\\d\\d\\d\\d(?:0[1-9]|1[0-2])(?:0[1-9]|[12][0-9]|3[01]))$" +) @@ -29,8 +35,6 @@ class BigqueryTableIdentifier: dataset: str table: str - invalid_chars: ClassVar[Set[str]] = {"$", "@"} - # Note: this regex may get overwritten by the sharded_table_pattern config. # The class-level constant, however, will not be overwritten. _BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: ClassVar[ @@ -40,7 +44,7 @@ class BigqueryTableIdentifier: _BQ_SHARDED_TABLE_SUFFIX: str = "_yyyymmdd" @staticmethod - def get_table_and_shard(table_name: str) -> Tuple[str, Optional[str]]: + def get_table_and_shard(table_name: str) -> Tuple[Optional[str], Optional[str]]: """ Args: table_name: @@ -53,16 +57,25 @@ def get_table_and_shard(table_name: str) -> Tuple[str, Optional[str]]: In case of non-sharded tables, returns (<table_id>, None) In case of sharded tables, returns (<table_prefix>, shard) """ + new_table_name = table_name match = re.match( BigqueryTableIdentifier._BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX, table_name, re.IGNORECASE, ) if match: - table_name = match.group(2) - shard = match.group(3) - return table_name, shard - return table_name, None + shard: str = match[3] + if shard: + if table_name.endswith(shard): + new_table_name = table_name[: -len(shard)] + + new_table_name = ( + new_table_name.rstrip("_") if new_table_name else new_table_name + ) + if new_table_name.endswith("."): + new_table_name = table_name + return (new_table_name, shard) if new_table_name else (None, shard) + return new_table_name, None @classmethod def from_string_name(cls, table: str) -> "BigqueryTableIdentifier": @@ -90,18 +103,7 @@ def get_table_display_name(self) -> str: ) table_name, _ = self.get_table_and_shard(shortened_table_name) - if not table_name: - table_name = self.dataset - - # Handle exceptions - invalid_chars_in_table_name: List[str] = [ - c for c in self.invalid_chars if c in table_name - ] - if invalid_chars_in_table_name: - raise ValueError( - f"Cannot handle {self.raw_table_name()} - poorly formatted table name, contains {invalid_chars_in_table_name}" - ) - return table_name + return table_name or self.dataset def get_table_name(self) -> str: """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py index 03b12c61ee5c6c..db552c09cd0a73 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_audit_log_api.py @@ -4,7 +4,6 @@ from google.cloud import bigquery from google.cloud.logging_v2.client import Client as GCPLoggingClient -from ratelimiter import RateLimiter from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( AuditLogEntry, @@ -17,6 +16,7 @@ BQ_DATE_SHARD_FORMAT, BQ_DATETIME_FORMAT, ) +from datahub.utilities.ratelimiter import RateLimiter logger: logging.Logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py index 3b06a4699c5660..f762d451849ab7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_config.py @@ -119,8 +119,8 @@ class
BigQueryV2Config( ) match_fully_qualified_names: bool = Field( - default=False, - description="Whether `dataset_pattern` is matched against fully qualified dataset name `<project_id>.<dataset>`.", + default=True, + description="[deprecated] Whether `dataset_pattern` is matched against fully qualified dataset name `<project_id>.<dataset>`.", ) include_external_url: bool = Field( @@ -206,11 +206,6 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: description="This flag enables the data lineage extraction from Data Lineage API exposed by Google Data Catalog. NOTE: This extractor can't build views lineage. It's recommended to enable the view's DDL parsing. Read the docs to have more information about: https://cloud.google.com/data-catalog/docs/concepts/about-data-lineage", ) - convert_urns_to_lowercase: bool = Field( - default=False, - description="Convert urns to lowercase.", - ) - enable_legacy_sharded_table_support: bool = Field( default=True, description="Use the legacy sharded table urn suffix added.", @@ -265,6 +260,11 @@ def validate_column_lineage(cls, v: bool, values: Dict[str, Any]) -> bool: description="Maximum number of entries for the in-memory caches of FileBacked data structures.", ) + exclude_empty_projects: bool = Field( + default=False, + description="Option to exclude empty projects from being ingested.", + ) + @root_validator(pre=False) def profile_default_settings(cls, values: Dict) -> Dict: # Extra default SQLAlchemy option for better connection pooling and threading. @@ -299,7 +299,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict: "use project_id_pattern whenever possible. project_id will be deprecated, please use project_id_pattern only if possible." ) - dataset_pattern = values.get("dataset_pattern") + dataset_pattern: Optional[AllowDenyPattern] = values.get("dataset_pattern") schema_pattern = values.get("schema_pattern") if ( dataset_pattern == AllowDenyPattern.allow_all() @@ -309,6 +309,7 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict: "dataset_pattern is not set but schema_pattern is set, using schema_pattern as dataset_pattern. schema_pattern will be deprecated, please use dataset_pattern instead." ) values["dataset_pattern"] = schema_pattern + dataset_pattern = schema_pattern elif ( dataset_pattern != AllowDenyPattern.allow_all() and schema_pattern != AllowDenyPattern.allow_all() @@ -327,9 +328,24 @@ def backward_compatibility_configs_set(cls, values: Dict) -> Dict: ): logger.warning( "Please update `dataset_pattern` to match against fully qualified schema name `<project_id>.<dataset>` and set config `match_fully_qualified_names : True`." - "Current default `match_fully_qualified_names: False` is only to maintain backward compatibility. " - "The config option `match_fully_qualified_names` will be deprecated in future and the default behavior will assume `match_fully_qualified_names: True`." + "The config option `match_fully_qualified_names` is deprecated and will be removed in a future release." ) + elif match_fully_qualified_names and dataset_pattern is not None: + adjusted = False + for lst in [dataset_pattern.allow, dataset_pattern.deny]: + for i, pattern in enumerate(lst): + if "." not in pattern: + if pattern.startswith("^"): + lst[i] = r"^.*\." + pattern[1:] + else: + lst[i] = r".*\." + pattern + adjusted = True + if adjusted: + logger.warning( + "`dataset_pattern` was adjusted to match against fully qualified schema names," + " of the form `<project_id>.<dataset>`."
+ ) + return values def get_table_pattern(self, pattern: List[str]) -> str: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py index 661589a0c58e59..9d92b011ee2856 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_report.py @@ -122,6 +122,8 @@ class BigQueryV2Report(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowR usage_state_size: Optional[str] = None + exclude_empty_projects: Optional[bool] = None + schema_api_perf: BigQuerySchemaApiPerfReport = field( default_factory=BigQuerySchemaApiPerfReport ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py index 98c8cbaf85eec5..e9acf5ea860445 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/lineage.py @@ -20,6 +20,7 @@ from google.cloud.datacatalog import lineage_v1 from google.cloud.logging_v2.client import Client as GCPLoggingClient +from datahub.configuration.pattern_utils import is_schema_allowed from datahub.emitter import mce_builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit @@ -548,7 +549,7 @@ def _get_parsed_audit_log_events(self, project_id: str) -> Iterable[QueryEvent]: # handle the case where the read happens within our time range but the query # completion event is delayed and happens after the configured end time. corrected_start_time = self.start_time - self.config.max_query_duration - corrected_end_time = self.end_time + -self.config.max_query_duration + corrected_end_time = self.end_time + self.config.max_query_duration self.report.log_entry_start_time = corrected_start_time self.report.log_entry_end_time = corrected_end_time @@ -683,8 +684,11 @@ def _create_lineage_map( self.report.num_skipped_lineage_entries_missing_data[e.project_id] += 1 continue - if not self.config.dataset_pattern.allowed( - destination_table.table_identifier.dataset + if not is_schema_allowed( + self.config.dataset_pattern, + destination_table.table_identifier.dataset, + destination_table.table_identifier.project_id, + self.config.match_fully_qualified_names, ) or not self.config.table_pattern.allowed( destination_table.table_identifier.get_table_name() ): diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py index b3e88459917b39..8ae17600e0eeaf 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/profiler.py @@ -1,12 +1,9 @@ -import dataclasses import logging from datetime import datetime from typing import Dict, Iterable, List, Optional, Tuple, cast from dateutil.relativedelta import relativedelta -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config @@ -15,7 +12,7 @@ RANGE_PARTITION_NAME, BigqueryTable, ) -from 
datahub.ingestion.source.ge_data_profiler import GEProfilerRequest +from datahub.ingestion.source.sql.sql_generic import BaseTable from datahub.ingestion.source.sql.sql_generic_profiler import ( GenericProfiler, TableProfilerRequest, @@ -25,12 +22,6 @@ logger = logging.getLogger(__name__) -@dataclasses.dataclass -class BigqueryProfilerRequest(GEProfilerRequest): - table: BigqueryTable - profile_table_level_only: bool = False - - class BigqueryProfiler(GenericProfiler): config: BigQueryV2Config report: BigQueryV2Report @@ -183,84 +174,54 @@ def get_workunits( ) # Emit the profile work unit - profile_request = self.get_bigquery_profile_request( - project=project_id, dataset=dataset, table=table - ) + profile_request = self.get_profile_request(table, dataset, project_id) if profile_request is not None: + self.report.report_entity_profiled(profile_request.pretty_name) profile_requests.append(profile_request) if len(profile_requests) == 0: return - yield from self.generate_wu_from_profile_requests(profile_requests) - - def generate_wu_from_profile_requests( - self, profile_requests: List[BigqueryProfilerRequest] - ) -> Iterable[MetadataWorkUnit]: - table_profile_requests = cast(List[TableProfilerRequest], profile_requests) - for request, profile in self.generate_profiles( - table_profile_requests, + yield from self.generate_profile_workunits( + profile_requests, self.config.profiling.max_workers, platform=self.platform, profiler_args=self.get_profile_args(), - ): - if request is None or profile is None: - continue - - request = cast(BigqueryProfilerRequest, request) - profile.sizeInBytes = request.table.size_in_bytes - # If table is partitioned we profile only one partition (if nothing set then the last one) - # but for table level we can use the rows_count from the table metadata - # This way even though column statistics only reflects one partition data but the rows count - # shows the proper count. 
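Stepping back to the sharded-table regex introduced in `bigquery_audit.py` above: its behavior is easy to sanity-check in isolation. A small sketch (pattern copied from the diff; only the standard `re` module is used):

```python
import re

# Copied from the diff: an optional prefix ending in a non-digit, optionally
# followed by "_" or "$", then a yyyymmdd suffix with a valid month and day.
SHARDED = r"((.+\D)[_$]?)?(\d\d\d\d(?:0[1-9]|1[0-2])(?:0[1-9]|[12][0-9]|3[01]))$"

for name in ["events_20200101", "events20211231", "events_20201340", "plain_table"]:
    m = re.match(SHARDED, name, re.IGNORECASE)
    print(f"{name!r} -> shard={m.group(3) if m else None}")
# 'events_20200101' -> shard=20200101
# 'events20211231' -> shard=20211231
# 'events_20201340' -> shard=None   (13 is not a valid month)
# 'plain_table' -> shard=None
```

Combined with the reworked `get_table_and_shard`, a name like `events_20200101` resolves to table `events` with shard `20200101`, while a bare `20200101` yields `(None, "20200101")`.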
- if profile.partitionSpec and profile.partitionSpec.partition: - profile.rowCount = request.table.rows_count - - dataset_name = request.pretty_name - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) - # We don't add to the profiler state if we only do table level profiling as it always happens - if self.state_handler and not request.profile_table_level_only: - self.state_handler.add_to_state( - dataset_urn, int(datetime.now().timestamp() * 1000) - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=profile - ).as_workunit() + ) - def get_bigquery_profile_request( - self, project: str, dataset: str, table: BigqueryTable - ) -> Optional[BigqueryProfilerRequest]: - skip_profiling = False - profile_table_level_only = self.config.profiling.profile_table_level_only - dataset_name = BigqueryTableIdentifier( - project_id=project, dataset=dataset, table=table.name + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + return BigqueryTableIdentifier( + project_id=db_name, dataset=schema_name, table=table_name ).get_table_name() - if not self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, table.size_in_bytes, table.rows_count - ): - profile_table_level_only = True - self.report.num_tables_not_eligible_profiling[f"{project}.{dataset}"] += 1 - if not table.column_count: - skip_profiling = True + def get_batch_kwargs( + self, table: BaseTable, schema_name: str, db_name: str + ) -> dict: + return dict( + schema=db_name, # <project> + table=f"{schema_name}.{table.name}", # <dataset>.<table> + ) - if skip_profiling: - if self.config.profiling.report_dropped_profiles: - self.report.report_dropped(f"profile of {dataset_name}") + def get_profile_request( + self, table: BaseTable, schema_name: str, db_name: str + ) -> Optional[TableProfilerRequest]: + profile_request = super().get_profile_request(table, schema_name, db_name) + + if not profile_request: return None + # The code below handles profiling changes required for partitioned or sharded tables + # 1. Skip profile if partition profiling is disabled. + # 2.
Else update `profile_request.batch_kwargs` with partition and custom_sql + + bq_table = cast(BigqueryTable, table) (partition, custom_sql) = self.generate_partition_profiler_query( - project, dataset, table, self.config.profiling.partition_datetime + db_name, schema_name, bq_table, self.config.profiling.partition_datetime ) - if partition is None and table.partition_info: + + if partition is None and bq_table.partition_info: self.report.report_warning( "profile skipped as partitioned table is empty or partition id or type was invalid", - dataset_name, + profile_request.pretty_name, ) return None if ( @@ -268,24 +229,20 @@ def get_bigquery_profile_request( and not self.config.profiling.partition_profiling_enabled ): logger.debug( - f"{dataset_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled" + f"{profile_request.pretty_name} and partition {partition} is skipped because profiling.partition_profiling_enabled property is disabled" ) self.report.profiling_skipped_partition_profiling_disabled.append( - dataset_name + profile_request.pretty_name ) return None - self.report.report_entity_profiled(dataset_name) - logger.debug(f"Preparing profiling request for {dataset_name}") - profile_request = BigqueryProfilerRequest( - pretty_name=dataset_name, - batch_kwargs=dict( - schema=project, - table=f"{dataset}.{table.name}", - custom_sql=custom_sql, - partition=partition, - ), - table=table, - profile_table_level_only=profile_table_level_only, - ) + if partition: + logger.debug("Updating profiling request for partitioned/sharded tables") + profile_request.batch_kwargs.update( + dict( + custom_sql=custom_sql, + partition=partition, + ) + ) + return profile_request diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py index a87cb8c1cbfa54..67fcc33cdf2182 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries.py @@ -51,8 +51,8 @@ class BigqueryQuery: p.max_partition_id, p.active_billable_bytes, p.long_term_billable_bytes, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base FROM `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t @@ -92,8 +92,8 @@ class BigqueryQuery: tos.OPTION_VALUE as comment, t.is_insertable_into, t.ddl, - REGEXP_EXTRACT(t.table_name, r".*_(\\d+)$") as table_suffix, - REGEXP_REPLACE(t.table_name, r"_(\\d+)$", "") as table_base + REGEXP_EXTRACT(t.table_name, r"(?:(?:.+\\D)[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$") as table_suffix, + REGEXP_REPLACE(t.table_name, r"(?:[_$]?)(\\d\\d\\d\\d(?:0[1-9]|1[012])(?:0[1-9]|[12][0-9]|3[01]))$", "") as table_base FROM `{{project_id}}`.`{{dataset_name}}`.INFORMATION_SCHEMA.TABLES t diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py index 201567e104a510..65b559550ffc59 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/usage.py @@ -21,6 +21,7 
@@ import humanfriendly +from datahub.configuration.pattern_utils import is_schema_allowed from datahub.configuration.time_window_config import ( BaseTimeWindowConfig, get_time_bucket, @@ -335,8 +336,13 @@ def get_time_window(self) -> Tuple[datetime, datetime]: def _is_table_allowed(self, table_ref: Optional[BigQueryTableRef]) -> bool: return ( table_ref is not None - and self.config.dataset_pattern.allowed(table_ref.table_identifier.dataset) - and self.config.table_pattern.allowed(table_ref.table_identifier.table) + and is_schema_allowed( + self.config.dataset_pattern, + table_ref.table_identifier.dataset, + table_ref.table_identifier.project_id, + self.config.match_fully_qualified_names, + ) + and self.config.table_pattern.allowed(str(table_ref.table_identifier)) ) def _should_ingest_usage(self) -> bool: @@ -844,7 +850,7 @@ def _get_parsed_bigquery_log_events( # handle the case where the read happens within our time range but the query # completion event is delayed and happens after the configured end time. corrected_start_time = self.start_time - self.config.max_query_duration - corrected_end_time = self.end_time + -self.config.max_query_duration + corrected_end_time = self.end_time + self.config.max_query_duration self.report.audit_start_time = corrected_start_time self.report.audit_end_time = corrected_end_time diff --git a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py index a2d89d26112f4e..741b4789bef216 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py +++ b/metadata-ingestion/src/datahub/ingestion/source/common/subtypes.py @@ -16,6 +16,9 @@ class DatasetSubTypes(str, Enum): SALESFORCE_STANDARD_OBJECT = "Object" POWERBI_DATASET_TABLE = "PowerBI Dataset Table" + # TODO: Create separate entity... + NOTEBOOK = "Notebook" + class DatasetContainerSubTypes(str, Enum): # Generic SubTypes diff --git a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py index 7cb487a86d9310..611f0c5c52cc65 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py +++ b/metadata-ingestion/src/datahub/ingestion/source/csv_enricher.py @@ -129,11 +129,9 @@ def __init__(self, config: CSVEnricherConfig, ctx: PipelineContext): # Map from entity urn to a list of SubResourceRow. self.editable_schema_metadata_map: Dict[str, List[SubResourceRow]] = {} self.should_overwrite: bool = self.config.write_semantics == "OVERRIDE" - if not self.should_overwrite and not self.ctx.graph: - raise ConfigurationError( - "With PATCH semantics, the csv-enricher source requires a datahub_api to connect to. " - "Consider using the datahub-rest sink or provide a datahub_api: configuration on your ingestion recipe." 
- ) + + if not self.should_overwrite: + self.ctx.require_graph(operation="The csv-enricher's PATCH semantics flag") def get_resource_glossary_terms_work_unit( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py index d1c949f48e2cd5..a35fb94614f722 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py +++ b/metadata-ingestion/src/datahub/ingestion/source/data_lake_common/path_spec.py @@ -18,7 +18,14 @@ logger: logging.Logger = logging.getLogger(__name__) SUPPORTED_FILE_TYPES: List[str] = ["csv", "tsv", "json", "parquet", "avro"] -SUPPORTED_COMPRESSIONS: List[str] = ["gz", "bz2"] + +# These come from the smart_open library. +SUPPORTED_COMPRESSIONS: List[str] = [ + "gz", + "bz2", + # We have a monkeypatch on smart_open that aliases .gzip to .gz. + "gzip", +] class PathSpec(ConfigModel): diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py index 053d136305527c..83958dc76754fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/config.py @@ -1,3 +1,4 @@ +import os from typing import Optional from pydantic import Field, root_validator @@ -67,9 +68,25 @@ class DataHubSourceConfig(StatefulIngestionConfigBase): ), ) + pull_from_datahub_api: bool = Field( + default=False, + description="Use the DataHub API to fetch versioned aspects.", + hidden_from_docs=True, + ) + + max_workers: int = Field( + default=5 * (os.cpu_count() or 4), + description="Number of worker threads to use for datahub api ingestion.", + hidden_from_docs=True, + ) + @root_validator def check_ingesting_data(cls, values): - if not values.get("database_connection") and not values.get("kafka_connection"): + if ( + not values.get("database_connection") + and not values.get("kafka_connection") + and not values.get("pull_from_datahub_api") + ): raise ValueError( "Your current config will not ingest any data." " Please specify at least one of `database_connection` or `kafka_connection`, ideally both." 
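With the hidden `pull_from_datahub_api` flag introduced above, the `datahub` source can be driven purely by the GMS API, with neither a database nor a Kafka connection. A hypothetical recipe, sketched in Python (the flag is `hidden_from_docs`, so treat this as experimental):

```python
# Sketch: a DataHub-to-DataHub pipeline that reads versioned aspects over the API.
from datahub.ingestion.run.pipeline import Pipeline

pipeline = Pipeline.create(
    {
        "source": {
            "type": "datahub",
            "config": {
                "pull_from_datahub_api": True,
                # Defaults to 5 * cpu_count; tune for your GMS instance.
                "max_workers": 10,
                "stateful_ingestion": {"enabled": True},
            },
        },
        # datahub_api supplies the graph client the source reads from.
        "datahub_api": {"server": "http://localhost:8080"},
        "sink": {"type": "console"},
    }
)
pipeline.run()
```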
diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_api_reader.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_api_reader.py new file mode 100644 index 00000000000000..7ee36736723b24 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_api_reader.py @@ -0,0 +1,49 @@ +import logging +from concurrent import futures +from typing import Dict, Iterable, List + +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.graph.filters import RemovedStatusFilter +from datahub.ingestion.source.datahub.config import DataHubSourceConfig +from datahub.ingestion.source.datahub.report import DataHubSourceReport +from datahub.metadata._schema_classes import _Aspect + +logger = logging.getLogger(__name__) + +# Should work for at least mysql, mariadb, postgres +DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S.%f" + + +class DataHubApiReader: + def __init__( + self, + config: DataHubSourceConfig, + report: DataHubSourceReport, + graph: DataHubGraph, + ): + self.config = config + self.report = report + self.graph = graph + + def get_aspects(self) -> Iterable[MetadataChangeProposalWrapper]: + urns = self.graph.get_urns_by_filter( + status=RemovedStatusFilter.ALL, + batch_size=self.config.database_query_batch_size, + ) + tasks: List[futures.Future[Iterable[MetadataChangeProposalWrapper]]] = [] + with futures.ThreadPoolExecutor( + max_workers=self.config.max_workers + ) as executor: + for urn in urns: + tasks.append(executor.submit(self._get_aspects_for_urn, urn)) + for task in futures.as_completed(tasks): + yield from task.result() + + def _get_aspects_for_urn(self, urn: str) -> Iterable[MetadataChangeProposalWrapper]: + aspects: Dict[str, _Aspect] = self.graph.get_entity_semityped(urn) # type: ignore + for aspect in aspects.values(): + yield MetadataChangeProposalWrapper( + entityUrn=urn, + aspect=aspect, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py index 2368febe1ff57e..a2f43b8cc62cb8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/datahub/datahub_source.py @@ -15,6 +15,7 @@ from datahub.ingestion.api.source_helpers import auto_workunit_reporter from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.datahub.config import DataHubSourceConfig +from datahub.ingestion.source.datahub.datahub_api_reader import DataHubApiReader from datahub.ingestion.source.datahub.datahub_database_reader import ( DataHubDatabaseReader, ) @@ -58,6 +59,9 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: logger.info(f"Ingesting DataHub metadata up until {self.report.stop_time}") state = self.stateful_ingestion_handler.get_last_run_state() + if self.config.pull_from_datahub_api: + yield from self._get_api_workunits() + if self.config.database_connection is not None: yield from self._get_database_workunits( from_createdon=state.database_createdon_datetime @@ -139,6 +143,18 @@ def _get_kafka_workunits( ) self._commit_progress(i) + def _get_api_workunits(self) -> Iterable[MetadataWorkUnit]: + if self.ctx.graph is None: + self.report.report_failure( + "datahub_api", + "Specify datahub_api on your ingestion recipe to ingest from the DataHub API", + ) + return + + reader = DataHubApiReader(self.config, self.report, self.ctx.graph) + 
for mcp in reader.get_aspects(): + yield mcp.as_workunit() + def _commit_progress(self, i: Optional[int] = None) -> None: """Commit progress to stateful storage, if there have been no errors. diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py index af9769bc9d94c9..da1ea8ecb4678a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py @@ -20,9 +20,8 @@ DBTCommonConfig, DBTNode, DBTSourceBase, - DBTTest, - DBTTestResult, ) +from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py index 782d94f39e8a55..c4de24bf192f16 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_common.py @@ -1,11 +1,10 @@ -import json import logging import re from abc import abstractmethod from dataclasses import dataclass, field from datetime import datetime from enum import auto -from typing import Any, Callable, ClassVar, Dict, Iterable, List, Optional, Tuple, Union +from typing import Any, Dict, Iterable, List, Optional, Tuple import pydantic from pydantic import root_validator, validator @@ -18,8 +17,8 @@ ConfigurationError, LineageConfig, ) -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.emitter import mce_builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext @@ -34,6 +33,12 @@ from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import DatasetSubTypes +from datahub.ingestion.source.dbt.dbt_tests import ( + DBTTest, + DBTTestResult, + make_assertion_from_test, + make_assertion_result_from_test, +) from datahub.ingestion.source.sql.sql_types import ( ATHENA_SQL_TYPES_MAP, BIGQUERY_TYPES_MAP, @@ -81,20 +86,7 @@ TimeTypeClass, ) from datahub.metadata.schema_classes import ( - AssertionInfoClass, - AssertionResultClass, - AssertionResultTypeClass, - AssertionRunEventClass, - AssertionRunStatusClass, - AssertionStdAggregationClass, - AssertionStdOperatorClass, - AssertionStdParameterClass, - AssertionStdParametersClass, - AssertionStdParameterTypeClass, - AssertionTypeClass, DataPlatformInstanceClass, - DatasetAssertionInfoClass, - DatasetAssertionScopeClass, DatasetPropertiesClass, GlobalTagsClass, GlossaryTermsClass, @@ -214,7 +206,9 @@ class DBTCommonConfig( default=False, description="Use model identifier instead of model name if defined (if not, default to model name).", ) - _deprecate_use_identifiers = pydantic_field_deprecated("use_identifiers") + _deprecate_use_identifiers = pydantic_field_deprecated( + "use_identifiers", warn_if_value_is_not=False + ) entities_enabled: DBTEntitiesEnabled = Field( DBTEntitiesEnabled(), @@ -278,6 +272,19 @@ class DBTCommonConfig( description="When enabled, converts column URNs to lowercase to ensure cross-platform compatibility. 
" "If `target_platform` is Snowflake, the default is True.", ) + use_compiled_code: bool = Field( + default=False, + description="When enabled, uses the compiled dbt code instead of the raw dbt node definition.", + ) + test_warnings_are_errors: bool = Field( + default=False, + description="When enabled, dbt test warnings will be treated as failures.", + ) + # override fault value to True. + incremental_lineage: bool = Field( + default=True, + description="When enabled, emits lineage as incremental to existing lineage already in DataHub. When disabled, re-states lineage on each run.", + ) @validator("target_platform") def validate_target_platform_value(cls, target_platform: str) -> str: @@ -541,134 +548,6 @@ def get_column_type( return SchemaFieldDataType(type=TypeClass()) -@dataclass -class AssertionParams: - scope: Union[DatasetAssertionScopeClass, str] - operator: Union[AssertionStdOperatorClass, str] - aggregation: Union[AssertionStdAggregationClass, str] - parameters: Optional[Callable[[Dict[str, str]], AssertionStdParametersClass]] = None - logic_fn: Optional[Callable[[Dict[str, str]], Optional[str]]] = None - - -def _get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]: - """ - Try to produce a useful string for the name of a relationship constraint. - Return None if we fail to - """ - destination_ref = kw_args.get("to") - source_ref = kw_args.get("model") - column_name = kw_args.get("column_name") - dest_field_name = kw_args.get("field") - if not destination_ref or not source_ref or not column_name or not dest_field_name: - # base assertions are violated, bail early - return None - m = re.match(r"^ref\(\'(.*)\'\)$", destination_ref) - if m: - destination_table = m.group(1) - else: - destination_table = destination_ref - m = re.search(r"ref\(\'(.*)\'\)", source_ref) - if m: - source_table = m.group(1) - else: - source_table = source_ref - return f"{source_table}.{column_name} referential integrity to {destination_table}.{dest_field_name}" - - -@dataclass -class DBTTest: - qualified_test_name: str - column_name: Optional[str] - kw_args: dict - - TEST_NAME_TO_ASSERTION_MAP: ClassVar[Dict[str, AssertionParams]] = { - "not_null": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.NOT_NULL, - aggregation=AssertionStdAggregationClass.IDENTITY, - ), - "unique": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.EQUAL_TO, - aggregation=AssertionStdAggregationClass.UNIQUE_PROPOTION, - parameters=lambda _: AssertionStdParametersClass( - value=AssertionStdParameterClass( - value="1.0", - type=AssertionStdParameterTypeClass.NUMBER, - ) - ), - ), - "accepted_values": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.IN, - aggregation=AssertionStdAggregationClass.IDENTITY, - parameters=lambda kw_args: AssertionStdParametersClass( - value=AssertionStdParameterClass( - value=json.dumps(kw_args.get("values")), - type=AssertionStdParameterTypeClass.SET, - ), - ), - ), - "relationships": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass._NATIVE_, - aggregation=AssertionStdAggregationClass.IDENTITY, - parameters=lambda kw_args: AssertionStdParametersClass( - value=AssertionStdParameterClass( - value=json.dumps(kw_args.get("values")), - type=AssertionStdParameterTypeClass.SET, - ), - ), - logic_fn=_get_name_for_relationship_test, - ), - 
"dbt_expectations.expect_column_values_to_not_be_null": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.NOT_NULL, - aggregation=AssertionStdAggregationClass.IDENTITY, - ), - "dbt_expectations.expect_column_values_to_be_between": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.BETWEEN, - aggregation=AssertionStdAggregationClass.IDENTITY, - parameters=lambda x: AssertionStdParametersClass( - minValue=AssertionStdParameterClass( - value=str(x.get("min_value", "unknown")), - type=AssertionStdParameterTypeClass.NUMBER, - ), - maxValue=AssertionStdParameterClass( - value=str(x.get("max_value", "unknown")), - type=AssertionStdParameterTypeClass.NUMBER, - ), - ), - ), - "dbt_expectations.expect_column_values_to_be_in_set": AssertionParams( - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass.IN, - aggregation=AssertionStdAggregationClass.IDENTITY, - parameters=lambda kw_args: AssertionStdParametersClass( - value=AssertionStdParameterClass( - value=json.dumps(kw_args.get("value_set")), - type=AssertionStdParameterTypeClass.SET, - ), - ), - ), - } - - -@dataclass -class DBTTestResult: - invocation_id: str - - status: str - execution_time: datetime - - native_results: Dict[str, str] - - -def string_map(input_map: Dict[str, Any]) -> Dict[str, str]: - return {k: str(v) for k, v in input_map.items()} - - @platform_name("dbt") @config_class(DBTCommonConfig) @support_status(SupportStatus.CERTIFIED) @@ -701,18 +580,22 @@ def create_test_entity_mcps( assertion_urn = mce_builder.make_assertion_urn( mce_builder.datahub_guid( { - "platform": DBT_PLATFORM, - "name": node.dbt_name, - "instance": self.config.platform_instance, - **( - # Ideally we'd include the env unconditionally. However, we started out - # not including env in the guid, so we need to maintain backwards compatibility - # with existing PROD assertions. - {"env": self.config.env} - if self.config.env != mce_builder.DEFAULT_ENV - and self.config.include_env_in_assertion_guid - else {} - ), + k: v + for k, v in { + "platform": DBT_PLATFORM, + "name": node.dbt_name, + "instance": self.config.platform_instance, + **( + # Ideally we'd include the env unconditionally. However, we started out + # not including env in the guid, so we need to maintain backwards compatibility + # with existing PROD assertions. + {"env": self.config.env} + if self.config.env != mce_builder.DEFAULT_ENV + and self.config.include_env_in_assertion_guid + else {} + ), + }.items() + if v is not None } ) ) @@ -736,7 +619,7 @@ def create_test_entity_mcps( for upstream_urn in sorted(upstream_urns): if self.config.entities_enabled.can_emit_node_type("test"): - yield self._make_assertion_from_test( + yield make_assertion_from_test( custom_props, node, assertion_urn, @@ -745,129 +628,17 @@ def create_test_entity_mcps( if node.test_result: if self.config.entities_enabled.can_emit_test_results: - yield self._make_assertion_result_from_test( - node, assertion_urn, upstream_urn + yield make_assertion_result_from_test( + node, + assertion_urn, + upstream_urn, + test_warnings_are_errors=self.config.test_warnings_are_errors, ) else: logger.debug( f"Skipping test result {node.name} emission since it is turned off." 
) - def _make_assertion_from_test( - self, - extra_custom_props: Dict[str, str], - node: DBTNode, - assertion_urn: str, - upstream_urn: str, - ) -> MetadataWorkUnit: - assert node.test_info - qualified_test_name = node.test_info.qualified_test_name - column_name = node.test_info.column_name - kw_args = node.test_info.kw_args - - if qualified_test_name in DBTTest.TEST_NAME_TO_ASSERTION_MAP: - assertion_params = DBTTest.TEST_NAME_TO_ASSERTION_MAP[qualified_test_name] - assertion_info = AssertionInfoClass( - type=AssertionTypeClass.DATASET, - customProperties=extra_custom_props, - datasetAssertion=DatasetAssertionInfoClass( - dataset=upstream_urn, - scope=assertion_params.scope, - operator=assertion_params.operator, - fields=[ - mce_builder.make_schema_field_urn(upstream_urn, column_name) - ] - if ( - assertion_params.scope - == DatasetAssertionScopeClass.DATASET_COLUMN - and column_name - ) - else [], - nativeType=node.name, - aggregation=assertion_params.aggregation, - parameters=assertion_params.parameters(kw_args) - if assertion_params.parameters - else None, - logic=assertion_params.logic_fn(kw_args) - if assertion_params.logic_fn - else None, - nativeParameters=string_map(kw_args), - ), - ) - elif column_name: - # no match with known test types, column-level test - assertion_info = AssertionInfoClass( - type=AssertionTypeClass.DATASET, - customProperties=extra_custom_props, - datasetAssertion=DatasetAssertionInfoClass( - dataset=upstream_urn, - scope=DatasetAssertionScopeClass.DATASET_COLUMN, - operator=AssertionStdOperatorClass._NATIVE_, - fields=[ - mce_builder.make_schema_field_urn(upstream_urn, column_name) - ], - nativeType=node.name, - logic=node.compiled_code if node.compiled_code else node.raw_code, - aggregation=AssertionStdAggregationClass._NATIVE_, - nativeParameters=string_map(kw_args), - ), - ) - else: - # no match with known test types, default to row-level test - assertion_info = AssertionInfoClass( - type=AssertionTypeClass.DATASET, - customProperties=extra_custom_props, - datasetAssertion=DatasetAssertionInfoClass( - dataset=upstream_urn, - scope=DatasetAssertionScopeClass.DATASET_ROWS, - operator=AssertionStdOperatorClass._NATIVE_, - logic=node.compiled_code if node.compiled_code else node.raw_code, - nativeType=node.name, - aggregation=AssertionStdAggregationClass._NATIVE_, - nativeParameters=string_map(kw_args), - ), - ) - - wu = MetadataChangeProposalWrapper( - entityUrn=assertion_urn, - aspect=assertion_info, - ).as_workunit() - - return wu - - def _make_assertion_result_from_test( - self, - node: DBTNode, - assertion_urn: str, - upstream_urn: str, - ) -> MetadataWorkUnit: - assert node.test_result - test_result = node.test_result - - assertionResult = AssertionRunEventClass( - timestampMillis=int(test_result.execution_time.timestamp() * 1000.0), - assertionUrn=assertion_urn, - asserteeUrn=upstream_urn, - runId=test_result.invocation_id, - result=AssertionResultClass( - type=AssertionResultTypeClass.SUCCESS - if test_result.status == "pass" - else AssertionResultTypeClass.FAILURE, - nativeResults=test_result.native_results, - ), - status=AssertionRunStatusClass.COMPLETE, - ) - - event = MetadataChangeProposalWrapper( - entityUrn=assertion_urn, - aspect=assertionResult, - ) - wu = MetadataWorkUnit( - id=f"{assertion_urn}-assertionRunEvent-{upstream_urn}", - mcp=event, - ) - return wu - @abstractmethod def load_nodes(self) -> Tuple[List[DBTNode], Dict[str, Optional[str]]]: # return dbt nodes + global custom properties @@ -1003,8 +774,8 @@ def create_platform_mces( 
aspects.append(upstream_lineage_class) # add view properties aspect - if node.raw_code and node.language == "sql": - view_prop_aspect = self._create_view_properties_aspect(node) + view_prop_aspect = self._create_view_properties_aspect(node) + if view_prop_aspect: aspects.append(view_prop_aspect) # emit subtype mcp @@ -1129,14 +900,21 @@ def _create_dataset_properties_aspect( def get_external_url(self, node: DBTNode) -> Optional[str]: pass - def _create_view_properties_aspect(self, node: DBTNode) -> ViewPropertiesClass: + def _create_view_properties_aspect( + self, node: DBTNode + ) -> Optional[ViewPropertiesClass]: + view_logic = ( + node.compiled_code if self.config.use_compiled_code else node.raw_code + ) + + if node.language != "sql" or not view_logic: + return None + materialized = node.materialization in {"table", "incremental", "snapshot"} - # this function is only called when raw sql is present. assert is added to satisfy lint checks - assert node.raw_code is not None view_properties = ViewPropertiesClass( materialized=materialized, viewLanguage="SQL", - viewLogic=node.raw_code, + viewLogic=view_logic, ) return view_properties @@ -1188,9 +966,15 @@ def _generate_base_aspects( ): aspects.append(meta_aspects.get(Constants.ADD_TERM_OPERATION)) + # add meta links aspect + meta_links_aspect = meta_aspects.get(Constants.ADD_DOC_LINK_OPERATION) + if meta_links_aspect and self.config.enable_meta_mapping: + aspects.append(meta_links_aspect) + # add schema metadata aspect schema_metadata = self.get_schema_metadata(self.report, node, mce_platform) aspects.append(schema_metadata) + return aspects def get_schema_metadata( diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py index c08295ed1dc593..dc3a84847beb24 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_core.py @@ -26,9 +26,8 @@ DBTNode, DBTSourceBase, DBTSourceReport, - DBTTest, - DBTTestResult, ) +from datahub.ingestion.source.dbt.dbt_tests import DBTTest, DBTTestResult logger = logging.getLogger(__name__) diff --git a/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_tests.py b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_tests.py new file mode 100644 index 00000000000000..721769d214d9e5 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_tests.py @@ -0,0 +1,261 @@ +import json +import re +from dataclasses import dataclass +from datetime import datetime +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Union + +from datahub.emitter import mce_builder +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionResultClass, + AssertionResultTypeClass, + AssertionRunEventClass, + AssertionRunStatusClass, + AssertionStdAggregationClass, + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, + AssertionTypeClass, + DatasetAssertionInfoClass, + DatasetAssertionScopeClass, +) + +if TYPE_CHECKING: + from datahub.ingestion.source.dbt.dbt_common import DBTNode + + +@dataclass +class DBTTest: + qualified_test_name: str + column_name: Optional[str] + kw_args: dict + + +@dataclass +class DBTTestResult: + invocation_id: str + + status: str + execution_time: datetime + + native_results: Dict[str, str] + + +def 
_get_name_for_relationship_test(kw_args: Dict[str, str]) -> Optional[str]:
+    """
+    Try to produce a useful string for the name of a relationship constraint.
+    Return None if we fail to construct one.
+    """
+    destination_ref = kw_args.get("to")
+    source_ref = kw_args.get("model")
+    column_name = kw_args.get("column_name")
+    dest_field_name = kw_args.get("field")
+    if not destination_ref or not source_ref or not column_name or not dest_field_name:
+        # base assertions are violated, bail early
+        return None
+    m = re.match(r"^ref\(\'(.*)\'\)$", destination_ref)
+    if m:
+        destination_table = m.group(1)
+    else:
+        destination_table = destination_ref
+    m = re.search(r"ref\(\'(.*)\'\)", source_ref)
+    if m:
+        source_table = m.group(1)
+    else:
+        source_table = source_ref
+    return f"{source_table}.{column_name} referential integrity to {destination_table}.{dest_field_name}"
+
+
+@dataclass
+class AssertionParams:
+    scope: Union[DatasetAssertionScopeClass, str]
+    operator: Union[AssertionStdOperatorClass, str]
+    aggregation: Union[AssertionStdAggregationClass, str]
+    parameters: Optional[Callable[[Dict[str, str]], AssertionStdParametersClass]] = None
+    logic_fn: Optional[Callable[[Dict[str, str]], Optional[str]]] = None
+
+
+_DBT_TEST_NAME_TO_ASSERTION_MAP: Dict[str, AssertionParams] = {
+    "not_null": AssertionParams(
+        scope=DatasetAssertionScopeClass.DATASET_COLUMN,
+        operator=AssertionStdOperatorClass.NOT_NULL,
+        aggregation=AssertionStdAggregationClass.IDENTITY,
+    ),
+    "unique": AssertionParams(
+        scope=DatasetAssertionScopeClass.DATASET_COLUMN,
+        operator=AssertionStdOperatorClass.EQUAL_TO,
+        aggregation=AssertionStdAggregationClass.UNIQUE_PROPOTION,  # sic: the generated schema constant is spelled this way
+        parameters=lambda _: AssertionStdParametersClass(
+            value=AssertionStdParameterClass(
+                value="1.0",
+                type=AssertionStdParameterTypeClass.NUMBER,
+            )
+        ),
+    ),
+    "accepted_values": AssertionParams(
+        scope=DatasetAssertionScopeClass.DATASET_COLUMN,
+        operator=AssertionStdOperatorClass.IN,
+        aggregation=AssertionStdAggregationClass.IDENTITY,
+        parameters=lambda kw_args: AssertionStdParametersClass(
+            value=AssertionStdParameterClass(
+                value=json.dumps(kw_args.get("values")),
+                type=AssertionStdParameterTypeClass.SET,
+            ),
+        ),
+    ),
+    "relationships": AssertionParams(
+        scope=DatasetAssertionScopeClass.DATASET_COLUMN,
+        operator=AssertionStdOperatorClass._NATIVE_,
+        aggregation=AssertionStdAggregationClass.IDENTITY,
+        parameters=lambda kw_args: AssertionStdParametersClass(
+            value=AssertionStdParameterClass(
+                value=json.dumps(kw_args.get("values")),
+                type=AssertionStdParameterTypeClass.SET,
+            ),
+        ),
+        logic_fn=_get_name_for_relationship_test,
+    ),
+    "dbt_expectations.expect_column_values_to_not_be_null": AssertionParams(
+        scope=DatasetAssertionScopeClass.DATASET_COLUMN,
+        operator=AssertionStdOperatorClass.NOT_NULL,
+        aggregation=AssertionStdAggregationClass.IDENTITY,
+    ),
+    "dbt_expectations.expect_column_values_to_be_between": AssertionParams(
+        scope=DatasetAssertionScopeClass.DATASET_COLUMN,
+        operator=AssertionStdOperatorClass.BETWEEN,
+        aggregation=AssertionStdAggregationClass.IDENTITY,
+        parameters=lambda x: AssertionStdParametersClass(
+            minValue=AssertionStdParameterClass(
+                value=str(x.get("min_value", "unknown")),
+                type=AssertionStdParameterTypeClass.NUMBER,
+            ),
+            maxValue=AssertionStdParameterClass(
+                value=str(x.get("max_value", "unknown")),
+                type=AssertionStdParameterTypeClass.NUMBER,
+            ),
+        ),
+    ),
+    "dbt_expectations.expect_column_values_to_be_in_set": AssertionParams(
+        scope=DatasetAssertionScopeClass.DATASET_COLUMN,
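# Illustrative sketch: how the relationship-test naming above resolves dbt's
# ref() syntax. Self-contained; the sample arguments are hypothetical manifest
# kwargs, not values from this diff.
import re

def demo_relationship_name(to: str, model: str, column_name: str, field: str) -> str:
    dest = re.match(r"^ref\('(.*)'\)$", to)
    src = re.search(r"ref\('(.*)'\)", model)
    destination_table = dest.group(1) if dest else to
    source_table = src.group(1) if src else model
    return (
        f"{source_table}.{column_name} referential integrity "
        f"to {destination_table}.{field}"
    )

# demo_relationship_name("ref('dim_customers')", "ref('fct_orders')", "customer_id", "id")
# -> "fct_orders.customer_id referential integrity to dim_customers.id"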
operator=AssertionStdOperatorClass.IN, + aggregation=AssertionStdAggregationClass.IDENTITY, + parameters=lambda kw_args: AssertionStdParametersClass( + value=AssertionStdParameterClass( + value=json.dumps(kw_args.get("value_set")), + type=AssertionStdParameterTypeClass.SET, + ), + ), + ), +} + + +def _string_map(input_map: Dict[str, Any]) -> Dict[str, str]: + return {k: str(v) for k, v in input_map.items()} + + +def make_assertion_from_test( + extra_custom_props: Dict[str, str], + node: "DBTNode", + assertion_urn: str, + upstream_urn: str, +) -> MetadataWorkUnit: + assert node.test_info + qualified_test_name = node.test_info.qualified_test_name + column_name = node.test_info.column_name + kw_args = node.test_info.kw_args + + if qualified_test_name in _DBT_TEST_NAME_TO_ASSERTION_MAP: + assertion_params = _DBT_TEST_NAME_TO_ASSERTION_MAP[qualified_test_name] + assertion_info = AssertionInfoClass( + type=AssertionTypeClass.DATASET, + customProperties=extra_custom_props, + datasetAssertion=DatasetAssertionInfoClass( + dataset=upstream_urn, + scope=assertion_params.scope, + operator=assertion_params.operator, + fields=[mce_builder.make_schema_field_urn(upstream_urn, column_name)] + if ( + assertion_params.scope == DatasetAssertionScopeClass.DATASET_COLUMN + and column_name + ) + else [], + nativeType=node.name, + aggregation=assertion_params.aggregation, + parameters=assertion_params.parameters(kw_args) + if assertion_params.parameters + else None, + logic=assertion_params.logic_fn(kw_args) + if assertion_params.logic_fn + else None, + nativeParameters=_string_map(kw_args), + ), + ) + elif column_name: + # no match with known test types, column-level test + assertion_info = AssertionInfoClass( + type=AssertionTypeClass.DATASET, + customProperties=extra_custom_props, + datasetAssertion=DatasetAssertionInfoClass( + dataset=upstream_urn, + scope=DatasetAssertionScopeClass.DATASET_COLUMN, + operator=AssertionStdOperatorClass._NATIVE_, + fields=[mce_builder.make_schema_field_urn(upstream_urn, column_name)], + nativeType=node.name, + logic=node.compiled_code or node.raw_code, + aggregation=AssertionStdAggregationClass._NATIVE_, + nativeParameters=_string_map(kw_args), + ), + ) + else: + # no match with known test types, default to row-level test + assertion_info = AssertionInfoClass( + type=AssertionTypeClass.DATASET, + customProperties=extra_custom_props, + datasetAssertion=DatasetAssertionInfoClass( + dataset=upstream_urn, + scope=DatasetAssertionScopeClass.DATASET_ROWS, + operator=AssertionStdOperatorClass._NATIVE_, + logic=node.compiled_code or node.raw_code, + nativeType=node.name, + aggregation=AssertionStdAggregationClass._NATIVE_, + nativeParameters=_string_map(kw_args), + ), + ) + + return MetadataChangeProposalWrapper( + entityUrn=assertion_urn, + aspect=assertion_info, + ).as_workunit() + + +def make_assertion_result_from_test( + node: "DBTNode", + assertion_urn: str, + upstream_urn: str, + test_warnings_are_errors: bool, +) -> MetadataWorkUnit: + assert node.test_result + test_result = node.test_result + + assertionResult = AssertionRunEventClass( + timestampMillis=int(test_result.execution_time.timestamp() * 1000.0), + assertionUrn=assertion_urn, + asserteeUrn=upstream_urn, + runId=test_result.invocation_id, + result=AssertionResultClass( + type=AssertionResultTypeClass.SUCCESS + if test_result.status == "pass" + or (not test_warnings_are_errors and test_result.status == "warn") + else AssertionResultTypeClass.FAILURE, + nativeResults=test_result.native_results, + ), + 
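# Illustrative sketch of the pass/warn/fail mapping used by
# make_assertion_result_from_test above: "warn" fails the assertion only when
# the source is configured to treat dbt test warnings as errors.
def result_type(status: str, warnings_are_errors: bool) -> str:
    if status == "pass" or (status == "warn" and not warnings_are_errors):
        return "SUCCESS"
    return "FAILURE"

assert result_type("pass", False) == "SUCCESS"
assert result_type("warn", False) == "SUCCESS"
assert result_type("warn", True) == "FAILURE"
assert result_type("fail", False) == "FAILURE"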
status=AssertionRunStatusClass.COMPLETE,
+    )
+
+    return MetadataChangeProposalWrapper(
+        entityUrn=assertion_urn,
+        aspect=assertionResult,
+    ).as_workunit()
diff --git a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py
index 6b7c118373673b..d7f3dfb9279fbb 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/dynamodb/dynamodb.py
@@ -1,5 +1,5 @@
 import logging
-from dataclasses import field
+from dataclasses import dataclass, field
 from typing import Any, Counter, Dict, Iterable, List, Optional, Type, Union
 
 import boto3
@@ -79,12 +79,13 @@ class DynamoDBConfig(DatasetSourceConfigMixin, StatefulIngestionConfigBase):
     table_pattern: AllowDenyPattern = Field(
         default=AllowDenyPattern.allow_all(),
-        description="regex patterns for tables to filter in ingestion.",
+        description="Regex patterns for tables to filter in ingestion. The table name format is 'region.table'.",
     )
     # Custom Stateful Ingestion settings
     stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None
 
 
+@dataclass
 class DynamoDBSourceReport(StaleEntityRemovalSourceReport):
     filtered: List[str] = field(default_factory=list)
 
@@ -175,39 +176,30 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
         # traverse databases in sorted order so output is consistent
         for region in dynamodb_regions:
-            try:
-                # create a new dynamodb client for each region,
-                # it seems for one client we could only list the table of one specific region,
-                # the list_tables() method don't take any config that related to region
-                # TODO: list table returns maximum number 100, need to implement pagination here
-                dynamodb_client = boto3.client(
-                    "dynamodb",
-                    region_name=region,
-                    aws_access_key_id=self.config.aws_access_key_id
-                    if self.config.aws_access_key_id
-                    else None,
-                    aws_secret_access_key=self.config.aws_secret_access_key.get_secret_value()
-                    if self.config.aws_secret_access_key
-                    else None,
-                )
-                table_names: List[str] = dynamodb_client.list_tables()["TableNames"]
-            except Exception as ex:
-                # TODO: If regions is config input then this would be self.report.report_warning,
-                # we can create dynamodb client to take aws region or regions as user input
-                logger.info(f"exception happen in region {region}, skipping: {ex}")
-                continue
-            for table_name in sorted(table_names):
-                if not self.config.table_pattern.allowed(table_name):
+            logger.info(f"Processing region {region}")
+            # Create a new DynamoDB client for each region: a single client can
+            # only list the tables of its own region, and list_tables() takes no
+            # region-related configuration.
+            dynamodb_client = boto3.client(
+                "dynamodb",
+                region_name=region,
+                aws_access_key_id=self.config.aws_access_key_id,
+                aws_secret_access_key=self.config.aws_secret_access_key.get_secret_value(),
+            )
+
+            for table_name in self._list_tables(dynamodb_client):
+                dataset_name = f"{region}.{table_name}"
+                if not self.config.table_pattern.allowed(dataset_name):
+                    logger.debug(f"Skipping table: {dataset_name}")
+                    self.report.report_dropped(dataset_name)
                     continue
+
+                logger.debug(f"Processing table: {dataset_name}")
                 table_info = dynamodb_client.describe_table(TableName=table_name)[
                     "Table"
                 ]
                 account_id = table_info["TableArn"].split(":")[4]
-                if not self.config.table_pattern.allowed(table_name):
-                    self.report.report_dropped(table_name)
-                    continue
                 platform_instance = self.config.platform_instance or 
account_id - dataset_name = f"{region}.{table_name}" dataset_urn = make_dataset_urn_with_platform_instance( platform=self.platform, platform_instance=platform_instance, @@ -222,7 +214,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) primary_key_dict = self.extract_primary_key_from_key_schema(table_info) table_schema = self.construct_schema_from_dynamodb( - dynamodb_client, table_name + dynamodb_client, region, table_name ) schema_metadata = self.construct_schema_metadata( table_name, @@ -254,9 +246,25 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: aspect=platform_instance_aspect, ).as_workunit() + def _list_tables( + self, + dynamodb_client: BaseClient, + ) -> Iterable[str]: + # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/dynamodb/paginator/ListTables.html + try: + for page in dynamodb_client.get_paginator("list_tables").paginate(): + table_names = page.get("TableNames") + if table_names: + yield from table_names + except Exception as ex: + # TODO: If regions is config input then this would be self.report.report_warning, + # we can create dynamodb client to take aws region or regions as user input + logger.info(f"Exception happened while listing tables, skipping: {ex}") + def construct_schema_from_dynamodb( self, dynamodb_client: BaseClient, + region: str, table_name: str, ) -> Dict[str, SchemaDescription]: """ @@ -275,7 +283,7 @@ def construct_schema_from_dynamodb( The MaxItems is the total number of items to return, and PageSize is the size of each page, we are assigning same value to these two config. If MaxItems is more than PageSize then we expect MaxItems / PageSize pages in response_iterator will return """ - self.include_table_item_to_schema(dynamodb_client, table_name, schema) + self.include_table_item_to_schema(dynamodb_client, region, table_name, schema) response_iterator = paginator.paginate( TableName=table_name, PaginationConfig={ @@ -294,33 +302,38 @@ def construct_schema_from_dynamodb( def include_table_item_to_schema( self, dynamodb_client: Any, + region: str, table_name: str, schema: Dict[str, SchemaDescription], ) -> None: """ - It will look up in the config include_table_item dict to see if the current table name exists as key, + It will look up in the config include_table_item dict to see if "region.table_name" exists as key, if it exists then get the items by primary key from the table and put it to schema """ if self.config.include_table_item is None: return - if table_name not in self.config.include_table_item.keys(): + dataset_name = f"{region}.{table_name}" + if dataset_name not in self.config.include_table_item.keys(): return - primary_key_list = self.config.include_table_item.get(table_name) + primary_key_list = self.config.include_table_item.get(dataset_name) assert isinstance(primary_key_list, List) if len(primary_key_list) > MAX_PRIMARY_KEYS_SIZE: logger.info( - f"the provided primary keys list size exceeded the max size for table {table_name}, we'll only process the first {MAX_PRIMARY_KEYS_SIZE} items" + f"the provided primary keys list size exceeded the max size for table {dataset_name}, we'll only process the first {MAX_PRIMARY_KEYS_SIZE} items" ) primary_key_list = primary_key_list[0:MAX_PRIMARY_KEYS_SIZE] items = [] response = dynamodb_client.batch_get_item( RequestItems={table_name: {"Keys": primary_key_list}} - ).get("Responses", None) + ).get("Responses") if response is None: logger.error( f"failed to retrieve item from table {table_name} by the given key {primary_key_list}" ) 
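# Illustrative, standalone sketch of the ListTables pagination pattern that
# _list_tables above relies on: the boto3 paginator transparently follows
# DynamoDB's 100-tables-per-page limit. The region and default credentials
# below are assumptions for the example.
import boto3

def iter_dynamodb_tables(region: str):
    client = boto3.client("dynamodb", region_name=region)
    for page in client.get_paginator("list_tables").paginate():
        yield from page.get("TableNames", [])

# for table in iter_dynamodb_tables("us-west-2"):
#     print(table)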
return + logger.debug( + f"successfully retrieved {len(primary_key_list)} items based on supplied primary key list" + ) items = response.get(table_name) self.construct_schema_from_items(items, schema) diff --git a/metadata-ingestion/src/datahub/ingestion/source/file.py b/metadata-ingestion/src/datahub/ingestion/source/file.py index de61fa8481c589..590aa59f7b5b6e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/file.py +++ b/metadata-ingestion/src/datahub/ingestion/source/file.py @@ -16,7 +16,7 @@ from pydantic.fields import Field from datahub.configuration.common import ConfigEnum, ConfigModel, ConfigurationError -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py index 01e083d566168d..c334a97680e3e4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_data_profiler.py @@ -273,6 +273,7 @@ class _SingleDatasetProfiler(BasicDatasetProfilerBase): partition: Optional[str] config: GEProfilingConfig report: SQLSourceReport + custom_sql: Optional[str] query_combiner: SQLAlchemyQueryCombiner @@ -405,22 +406,52 @@ def _get_dataset_rows(self, dataset_profile: DatasetProfileClass) -> None: def _get_dataset_column_min( self, column_profile: DatasetFieldProfileClass, column: str ) -> None: - if self.config.include_field_min_value: + if not self.config.include_field_min_value: + return + try: column_profile.min = str(self.dataset.get_column_min(column)) + except Exception as e: + logger.debug( + f"Caught exception while attempting to get column min for column {column}. {e}" + ) + self.report.report_warning( + "Profiling - Unable to get column min", + f"{self.dataset_name}.{column}", + ) @_run_with_query_combiner def _get_dataset_column_max( self, column_profile: DatasetFieldProfileClass, column: str ) -> None: - if self.config.include_field_max_value: + if not self.config.include_field_max_value: + return + try: column_profile.max = str(self.dataset.get_column_max(column)) + except Exception as e: + logger.debug( + f"Caught exception while attempting to get column max for column {column}. {e}" + ) + self.report.report_warning( + "Profiling - Unable to get column max", + f"{self.dataset_name}.{column}", + ) @_run_with_query_combiner def _get_dataset_column_mean( self, column_profile: DatasetFieldProfileClass, column: str ) -> None: - if self.config.include_field_mean_value: + if not self.config.include_field_mean_value: + return + try: column_profile.mean = str(self.dataset.get_column_mean(column)) + except Exception as e: + logger.debug( + f"Caught exception while attempting to get column mean for column {column}. 
{e}" + ) + self.report.report_warning( + "Profiling - Unable to get column mean", + f"{self.dataset_name}.{column}", + ) @_run_with_query_combiner def _get_dataset_column_median( @@ -596,16 +627,8 @@ def generate_dataset_profile( # noqa: C901 (complexity) "catch_exceptions", self.config.catch_exceptions ) - profile = DatasetProfileClass(timestampMillis=get_sys_time()) - if self.partition: - profile.partitionSpec = PartitionSpecClass(partition=self.partition) - elif self.config.limit and self.config.offset: - profile.partitionSpec = PartitionSpecClass( - type=PartitionTypeClass.QUERY, - partition=json.dumps( - dict(limit=self.config.limit, offset=self.config.offset) - ), - ) + profile = self.init_profile() + profile.fieldProfiles = [] self._get_dataset_rows(profile) @@ -636,7 +659,16 @@ def generate_dataset_profile( # noqa: C901 (complexity) self.query_combiner.flush() assert profile.rowCount is not None - row_count: int = profile.rowCount + row_count: int # used for null counts calculation + if profile.partitionSpec and "SAMPLE" in profile.partitionSpec.partition: + # We can alternatively use `self._get_dataset_rows(profile)` to get + # exact count of rows in sample, as actual rows involved in sample + # may be slightly different (more or less) than configured `sample_size`. + # However not doing so to start with, as that adds another query overhead + # plus approximate metrics should work for sampling based profiling. + row_count = self.config.sample_size + else: + row_count = profile.rowCount for column_spec in columns_profiling_queue: column = column_spec.column @@ -740,6 +772,24 @@ def generate_dataset_profile( # noqa: C901 (complexity) self.query_combiner.flush() return profile + def init_profile(self): + profile = DatasetProfileClass(timestampMillis=get_sys_time()) + if self.partition: + profile.partitionSpec = PartitionSpecClass(partition=self.partition) + elif self.config.limit: + profile.partitionSpec = PartitionSpecClass( + type=PartitionTypeClass.QUERY, + partition=json.dumps( + dict(limit=self.config.limit, offset=self.config.offset) + ), + ) + elif self.custom_sql: + profile.partitionSpec = PartitionSpecClass( + type=PartitionTypeClass.QUERY, partition="SAMPLE" + ) + + return profile + def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> None: if ( self.dataset.engine.dialect.name.lower() == BIGQUERY @@ -770,7 +820,7 @@ def update_dataset_batch_use_sampling(self, profile: DatasetProfileClass) -> Non sample_pc = 100 * self.config.sample_size / profile.rowCount sql = ( f"SELECT * FROM {str(self.dataset._table)} " - + f"TABLESAMPLE SYSTEM ({sample_pc:.3f} percent)" + + f"TABLESAMPLE SYSTEM ({sample_pc:.8f} percent)" ) temp_table_name = create_bigquery_temp_table( self, @@ -1064,6 +1114,7 @@ def _generate_single_profile( partition, self.config, self.report, + custom_sql, query_combiner, ).generate_dataset_profile() diff --git a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py index 77761c529ba0b1..24a3e520d8caff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/ge_profiling_config.py @@ -157,12 +157,12 @@ class GEProfilingConfig(ConfigModel): ) use_sampling: bool = Field( default=True, - description="Whether to profile column level stats on sample of table. Only BigQuery supports this. " + description="Whether to profile column level stats on sample of table. 
Only BigQuery and Snowflake support this. " "If enabled, profiling is done on rows sampled from table. Sampling is not done for smaller tables. ", ) sample_size: int = Field( - default=1000, + default=10000, description="Number of rows to be sampled from table for column level profiling." "Applicable only if `use_sampling` is set to True.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka.py b/metadata-ingestion/src/datahub/ingestion/source/kafka.py index 566304e1999b79..23770ff3cf8122 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka.py @@ -3,7 +3,7 @@ import logging from dataclasses import dataclass, field from enum import Enum -from typing import Any, Dict, Iterable, List, Optional, Type +from typing import Any, Dict, Iterable, List, Optional, Type, cast import avro.schema import confluent_kafka @@ -18,7 +18,10 @@ from datahub.configuration.common import AllowDenyPattern from datahub.configuration.kafka import KafkaConsumerConnectionConfig -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import ( + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, +) from datahub.emitter import mce_builder from datahub.emitter.mce_builder import ( make_data_platform_urn, @@ -76,7 +79,11 @@ class KafkaTopicConfigKeys(str, Enum): UNCLEAN_LEADER_ELECTION_CONFIG = "unclean.leader.election.enable" -class KafkaSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): +class KafkaSourceConfig( + StatefulIngestionConfigBase, + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, +): connection: KafkaConsumerConnectionConfig = KafkaConsumerConnectionConfig() topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["^_.*"]) @@ -309,13 +316,20 @@ def _extract_record( avro_schema = avro.schema.parse( schema_metadata.platformSchema.documentSchema ) - description = avro_schema.doc + description = getattr(avro_schema, "doc", None) # set the tags all_tags: List[str] = [] - for tag in avro_schema.other_props.get( - self.source_config.schema_tags_field, [] - ): - all_tags.append(self.source_config.tag_prefix + tag) + try: + schema_tags = cast( + Iterable[str], + avro_schema.other_props.get( + self.source_config.schema_tags_field, [] + ), + ) + for tag in schema_tags: + all_tags.append(self.source_config.tag_prefix + tag) + except TypeError: + pass if self.source_config.enable_meta_mapping: meta_aspects = self.meta_processor.process(avro_schema.other_props) diff --git a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py index f3344782917abf..1a1e012e806333 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py +++ b/metadata-ingestion/src/datahub/ingestion/source/kafka_connect.py @@ -28,7 +28,9 @@ ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql.sql_common import get_platform_from_sqlalchemy_uri +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -1094,6 +1096,7 @@ def transform_connector_config( @config_class(KafkaConnectSourceConfig) @support_status(SupportStatus.CERTIFIED) 
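# Illustrative sketch of the defensive avro handling introduced in the Kafka
# source above: non-record schemas have no .doc attribute and custom properties
# may not be iterable, hence getattr(..., None) and the TypeError guard. The
# schema JSON and tag prefix are hypothetical.
import avro.schema

parsed = avro.schema.parse(
    '{"type": "record", "name": "User", "doc": "demo", "fields": []}'
)
description = getattr(parsed, "doc", None)  # None instead of AttributeError
tags = []
try:
    for tag in parsed.other_props.get("tags", []):
        tags.append("prefix." + tag)
except TypeError:
    pass  # the custom property was not iterable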
@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") class KafkaConnectSource(StatefulIngestionSourceBase): config: KafkaConnectSourceConfig report: KafkaConnectSourceReport diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py index 89b1e45695c578..30c38720dd96c4 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_common.py @@ -81,9 +81,6 @@ EnumTypeClass, FineGrainedLineageClass, GlobalTagsClass, - OwnerClass, - OwnershipClass, - OwnershipTypeClass, SchemaMetadataClass, StatusClass, SubTypesClass, @@ -453,17 +450,9 @@ def _get_schema( @staticmethod def _get_tag_mce_for_urn(tag_urn: str) -> MetadataChangeEvent: assert tag_urn in LookerUtil.tag_definitions - ownership = OwnershipClass( - owners=[ - OwnerClass( - owner="urn:li:corpuser:datahub", - type=OwnershipTypeClass.DATAOWNER, - ) - ] - ) return MetadataChangeEvent( proposedSnapshot=TagSnapshotClass( - urn=tag_urn, aspects=[ownership, LookerUtil.tag_definitions[tag_urn]] + urn=tag_urn, aspects=[LookerUtil.tag_definitions[tag_urn]] ) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py index cf132b7ef27f76..b00f74b71e7922 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py @@ -123,8 +123,12 @@ def get_user(self, id_: str, user_fields: str) -> Optional[User]: transport_options=self.transport_options, ) except SDKError as e: - logger.warning(f"Could not find user with id {id_}") - logger.warning(f"Failure was {e}") + if "Looker Not Found (404)" in str(e): + # User not found + logger.info(f"Could not find user with id {id_}: 404 error") + else: + logger.warning(f"Could not find user with id {id_}") + logger.warning(f"Failure was {e}") # User not found return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py index 8297a0aa8efa7e..09683d790c14c7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/looker_source.py @@ -103,6 +103,11 @@ @capability( SourceCapability.OWNERSHIP, "Enabled by default, configured using `extract_owners`" ) +@capability(SourceCapability.LINEAGE_COARSE, "Supported by default") +@capability( + SourceCapability.LINEAGE_FINE, + "Enabled by default, configured using `extract_column_level_lineage`", +) @capability( SourceCapability.USAGE_STATS, "Enabled by default, configured using `extract_usage_history`", @@ -921,14 +926,7 @@ def process_metrics_dimensions_and_fields_for_dashboard( mcps = chart_mcps mcps.append(dashboard_mcp) - workunits = [ - MetadataWorkUnit( - id=f"looker-{mcp.aspectName}-{mcp.entityUrn}", - mcp=mcp, - treat_errors_as_warnings=True, - ) - for mcp in mcps - ] + workunits = [mcp.as_workunit() for mcp in mcps] return workunits @@ -1128,7 +1126,6 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: def emit_independent_looks_mcp( self, dashboard_element: LookerDashboardElement ) -> Iterable[MetadataWorkUnit]: - yield from auto_workunit( 
stream=self._make_chart_metadata_events( dashboard_element=dashboard_element, @@ -1316,10 +1313,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: id=f"looker-{event.proposedSnapshot.urn}", mce=event ) elif isinstance(event, MetadataChangeProposalWrapper): - # We want to treat subtype aspects as optional, so allowing failures in this aspect to be treated as warnings rather than failures - yield event.as_workunit( - treat_errors_as_warnings=event.aspectName in ["subTypes"] - ) + yield event.as_workunit() else: raise Exception(f"Unexpected type of event {event}") self.reporter.report_stage_end("explore_metadata") diff --git a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py index e69c3b6e601bd3..e6b78cc7a77450 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py @@ -2171,10 +2171,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901 for mcp in self._build_dataset_mcps( maybe_looker_view ): - # We want to treat mcp aspects as optional, so allowing failures in this aspect to be treated as warnings rather than failures - yield mcp.as_workunit( - treat_errors_as_warnings=True - ) + yield mcp.as_workunit() else: ( prev_model_name, diff --git a/metadata-ingestion/src/datahub/ingestion/source/metabase.py b/metadata-ingestion/src/datahub/ingestion/source/metabase.py index fb4512893feb1c..24145d60210ff0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metabase.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metabase.py @@ -80,6 +80,7 @@ def remove_trailing_slash(cls, v): @config_class(MetabaseConfig) @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Supported by default") class MetabaseSource(Source): """ This plugin extracts Charts, dashboards, and associated metadata. 
This plugin is in beta and has only been tested diff --git a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py index 1c0c809c16a60e..f33c6e0edae3dc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/metadata/lineage.py @@ -23,11 +23,17 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, + capability, config_class, platform_name, support_status, ) -from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.source import ( + MetadataWorkUnitProcessor, + Source, + SourceCapability, + SourceReport, +) from datahub.ingestion.api.source_helpers import ( auto_status_aspect, auto_workunit_reporter, @@ -121,6 +127,8 @@ def version_must_be_1(cls, v): @platform_name("File Based Lineage") @config_class(LineageFileSourceConfig) @support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.LINEAGE_COARSE, "Specified in the lineage file.") +@capability(SourceCapability.LINEAGE_FINE, "Specified in the lineage file.") @dataclass class LineageFileSource(Source): """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/mode.py b/metadata-ingestion/src/datahub/ingestion/source/mode.py index a000c66a406c20..c46b56da422d96 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mode.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mode.py @@ -98,6 +98,7 @@ class HTTPError429(HTTPError): @config_class(ModeConfig) @support_status(SupportStatus.CERTIFIED) @capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Supported by default") class ModeSource(Source): """ diff --git a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py index f02b6845e40b5f..ce2b9ce2981e09 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/mongodb.py +++ b/metadata-ingestion/src/datahub/ingestion/source/mongodb.py @@ -11,7 +11,16 @@ from pymongo.mongo_client import MongoClient from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.source_common import EnvConfigMixin +from datahub.configuration.source_common import ( + EnvConfigMixin, + PlatformInstanceConfigMixin, +) +from datahub.emitter.mce_builder import ( + make_data_platform_urn, + make_dataplatform_instance_urn, + make_dataset_urn_with_platform_instance, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SourceCapability, @@ -21,14 +30,21 @@ platform_name, support_status, ) -from datahub.ingestion.api.source import Source, SourceReport +from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.schema_inference.object import ( SchemaDescription, construct_schema, ) -from datahub.metadata.com.linkedin.pegasus2avro.metadata.snapshot import DatasetSnapshot -from datahub.metadata.com.linkedin.pegasus2avro.mxe import MetadataChangeEvent +from datahub.ingestion.source.state.stale_entity_removal_handler import ( + StaleEntityRemovalHandler, + StaleEntityRemovalSourceReport, + StatefulIngestionConfigBase, + StatefulStaleMetadataRemovalConfig, +) +from 
datahub.ingestion.source.state.stateful_ingestion_base import ( + StatefulIngestionSourceBase, +) from datahub.metadata.com.linkedin.pegasus2avro.schema import ( ArrayTypeClass, BooleanTypeClass, @@ -44,7 +60,10 @@ TimeTypeClass, UnionTypeClass, ) -from datahub.metadata.schema_classes import DatasetPropertiesClass +from datahub.metadata.schema_classes import ( + DataPlatformInstanceClass, + DatasetPropertiesClass, +) logger = logging.getLogger(__name__) @@ -55,7 +74,9 @@ DENY_DATABASE_LIST = set(["admin", "config", "local"]) -class MongoDBConfig(EnvConfigMixin): +class MongoDBConfig( + PlatformInstanceConfigMixin, EnvConfigMixin, StatefulIngestionConfigBase +): # See the MongoDB authentication docs for details and examples. # https://pymongo.readthedocs.io/en/stable/examples/authentication.html connect_uri: str = Field( @@ -95,6 +116,8 @@ class MongoDBConfig(EnvConfigMixin): default=AllowDenyPattern.allow_all(), description="regex patterns for collections to filter in ingestion.", ) + # Custom Stateful Ingestion settings + stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None @validator("maxDocumentSize") def check_max_doc_size_filter_is_valid(cls, doc_size_filter_value): @@ -104,7 +127,7 @@ def check_max_doc_size_filter_is_valid(cls, doc_size_filter_value): @dataclass -class MongoDBSourceReport(SourceReport): +class MongoDBSourceReport(StaleEntityRemovalSourceReport): filtered: List[str] = field(default_factory=list) def report_dropped(self, name: str) -> None: @@ -125,6 +148,7 @@ def report_dropped(self, name: str) -> None: bson.timestamp.Timestamp: "timestamp", bson.dbref.DBRef: "dbref", bson.objectid.ObjectId: "oid", + bson.Decimal128: "numberDecimal", "mixed": "mixed", } @@ -141,6 +165,7 @@ def report_dropped(self, name: str) -> None: bson.timestamp.Timestamp: TimeTypeClass, bson.dbref.DBRef: BytesTypeClass, bson.objectid.ObjectId: BytesTypeClass, + bson.Decimal128: NumberTypeClass, dict: RecordTypeClass, "mixed": UnionTypeClass, } @@ -199,9 +224,10 @@ def construct_schema_pymongo( @platform_name("MongoDB") @config_class(MongoDBConfig) @support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") @capability(SourceCapability.SCHEMA_METADATA, "Enabled by default") @dataclass -class MongoDBSource(Source): +class MongoDBSource(StatefulIngestionSourceBase): """ This plugin extracts the following: @@ -222,7 +248,7 @@ class MongoDBSource(Source): mongo_client: MongoClient def __init__(self, ctx: PipelineContext, config: MongoDBConfig): - super().__init__(ctx) + super().__init__(config, ctx) self.config = config self.report = MongoDBSourceReport() @@ -249,6 +275,14 @@ def create(cls, config_dict: dict, ctx: PipelineContext) -> "MongoDBSource": config = MongoDBConfig.parse_obj(config_dict) return cls(ctx, config) + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: + return [ + *super().get_workunit_processors(), + StaleEntityRemovalHandler.create( + self, self.config, self.ctx + ).workunit_processor, + ] + def get_pymongo_type_string( self, field_type: Union[Type, str], collection_name: str ) -> str: @@ -320,18 +354,25 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.report.report_dropped(dataset_name) continue - dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.config.env})" - - dataset_snapshot = DatasetSnapshot( - urn=dataset_urn, - aspects=[], + dataset_urn = make_dataset_urn_with_platform_instance( + platform=platform, + 
name=dataset_name,
+                    env=self.config.env,
+                    platform_instance=self.config.platform_instance,
                 )
+                # Default to None so the aspects list below is always defined;
+                # construct_many drops aspects that are None.
+                data_platform_instance: Optional[DataPlatformInstanceClass] = None
+                if self.config.platform_instance:
+                    data_platform_instance = DataPlatformInstanceClass(
+                        platform=make_data_platform_urn(platform),
+                        instance=make_dataplatform_instance_urn(
+                            platform, self.config.platform_instance
+                        ),
+                    )
+
                 dataset_properties = DatasetPropertiesClass(
                     tags=[],
                     customProperties={},
                 )
-                dataset_snapshot.aspects.append(dataset_properties)
 
                 if self.config.enableSchemaInference:
                     assert self.config.maxDocumentSize is not None
@@ -402,13 +443,20 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
                     fields=canonical_schema,
                 )
 
-                dataset_snapshot.aspects.append(schema_metadata)
-
                 # TODO: use list_indexes() or index_information() to get index information
                 # See https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.list_indexes.
-                mce = MetadataChangeEvent(proposedSnapshot=dataset_snapshot)
-                yield MetadataWorkUnit(id=dataset_name, mce=mce)
+                yield from [
+                    mcp.as_workunit()
+                    for mcp in MetadataChangeProposalWrapper.construct_many(
+                        entityUrn=dataset_urn,
+                        aspects=[
+                            schema_metadata,
+                            dataset_properties,
+                            data_platform_instance,
+                        ],
+                    )
+                ]
 
     def is_server_version_gte_4_4(self) -> bool:
         try:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/nifi.py b/metadata-ingestion/src/datahub/ingestion/source/nifi.py
index ac1e03812db3bf..bc05edbb3c623a 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/nifi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/nifi.py
@@ -26,11 +26,12 @@
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
     SupportStatus,
+    capability,
     config_class,
     platform_name,
     support_status,
 )
-from datahub.ingestion.api.source import Source, SourceReport
+from datahub.ingestion.api.source import Source, SourceCapability, SourceReport
 from datahub.ingestion.api.workunit import MetadataWorkUnit
 from datahub.metadata.schema_classes import (
     DataFlowInfoClass,
@@ -360,6 +361,7 @@ def report_dropped(self, ent_name: str) -> None:
 
 @platform_name("NiFi", id="nifi")
 @config_class(NifiSourceConfig)
 @support_status(SupportStatus.CERTIFIED)
+@capability(SourceCapability.LINEAGE_COARSE, "Supported. See docs for limitations")
 class NifiSource(Source):
     """
     This plugin extracts the following:
diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
index ffa685fb258267..96729f4c60c6c4 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py
@@ -9,8 +9,8 @@
 import datahub.emitter.mce_builder as builder
 from datahub.configuration.common import AllowDenyPattern, ConfigModel
-from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated
 from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin
+from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated
 from datahub.ingestion.source.common.subtypes import BIAssetSubTypes
 from datahub.ingestion.source.state.stale_entity_removal_handler import (
     StaleEntityRemovalSourceReport,
@@ -397,6 +397,42 @@ class PowerBiDashboardSourceConfig(
         "as this option generates the upstream datasets URN in lowercase.",
     )
 
+    # Enable CLL extraction
+    extract_column_level_lineage: bool = pydantic.Field(
+        default=False,
+        description="Whether to extract column level lineage. 
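# Illustrative sketch of the construct_many fan-out used in the MongoDB source
# above. Hedged assumption: MetadataChangeProposalWrapper.construct_many emits
# one MCP per non-None aspect, which is why the optional platform-instance
# aspect can be passed in unconditionally once it defaults to None.
from typing import List, Optional, Sequence, Tuple

def construct_many_sketch(
    entity_urn: str, aspects: Sequence[Optional[str]]
) -> List[Tuple[str, str]]:
    # One (urn, aspect) pair per aspect, skipping Nones.
    return [(entity_urn, aspect) for aspect in aspects if aspect is not None]

mcps = construct_many_sketch("urn:li:dataset:demo", ["schema", "props", None])
assert len(mcps) == 2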
" + "Works only if configs `native_query_parsing`, `enable_advance_lineage_sql_construct` & `extract_lineage` are enabled. " + "Works for M-Query where native SQL is used for transformation.", + ) + + @root_validator + @classmethod + def validate_extract_column_level_lineage(cls, values: Dict) -> Dict: + flags = [ + "native_query_parsing", + "enable_advance_lineage_sql_construct", + "extract_lineage", + ] + + if ( + "extract_column_level_lineage" in values + and values["extract_column_level_lineage"] is False + ): + # Flag is not set. skip validation + return values + + logger.debug(f"Validating additional flags: {flags}") + + is_flag_enabled: bool = True + for flag in flags: + if flag not in values or values[flag] is False: + is_flag_enabled = False + + if not is_flag_enabled: + raise ValueError(f"Enable all these flags in recipe: {flags} ") + + return values + @validator("dataset_type_mapping") @classmethod def map_data_platform(cls, value): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 021c429c3c6333..0afa8e7ff4564e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -9,7 +9,7 @@ SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] -logger = logging.getLogger() +logger = logging.getLogger(__name__) def remove_special_characters(native_query: str) -> str: @@ -21,7 +21,7 @@ def remove_special_characters(native_query: str) -> str: def get_tables(native_query: str) -> List[str]: native_query = remove_special_characters(native_query) - logger.debug(f"Processing query = {native_query}") + logger.debug(f"Processing native query = {native_query}") tables: List[str] = [] parsed = sqlparse.parse(native_query)[0] tokens: List[sqlparse.sql.Token] = list(parsed.tokens) @@ -65,7 +65,7 @@ def parse_custom_sql( sql_query = remove_special_characters(query) - logger.debug(f"Parsing sql={sql_query}") + logger.debug(f"Processing native query = {sql_query}") return sqlglot_l.create_lineage_sql_parsed_result( query=sql_query, diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 8cc38c366c42a4..9134932c39fe0d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -56,7 +56,7 @@ def get_upstream_tables( ctx: PipelineContext, config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, -) -> List[resolver.DataPlatformTable]: +) -> List[resolver.Lineage]: if table.expression is None: logger.debug(f"Expression is none for table {table.full_name}") return [] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index 479f1decff903d..e200ff41f71c25 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -27,7 +27,7 @@ IdentifierAccessor, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table -from datahub.utilities.sqlglot_lineage import SqlParsingResult +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult logger = 
logging.getLogger(__name__) @@ -38,6 +38,16 @@ class DataPlatformTable: urn: str +@dataclass +class Lineage: + upstreams: List[DataPlatformTable] + column_lineage: List[ColumnLineageInfo] + + @staticmethod + def empty() -> "Lineage": + return Lineage(upstreams=[], column_lineage=[]) + + def urn_to_lowercase(value: str, flag: bool) -> str: if flag is True: return value.lower() @@ -120,9 +130,9 @@ def __init__( self.platform_instance_resolver = platform_instance_resolver @abstractmethod - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: pass @abstractmethod @@ -147,7 +157,7 @@ def get_db_detail_from_argument( def parse_custom_sql( self, query: str, server: str, database: Optional[str], schema: Optional[str] - ) -> List[DataPlatformTable]: + ) -> Lineage: dataplatform_tables: List[DataPlatformTable] = [] @@ -174,7 +184,7 @@ def parse_custom_sql( if parsed_result is None: logger.debug("Failed to parse query") - return dataplatform_tables + return Lineage.empty() for urn in parsed_result.in_tables: dataplatform_tables.append( @@ -184,9 +194,15 @@ def parse_custom_sql( ) ) + logger.debug(f"Native Query parsed result={parsed_result}") logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") - return dataplatform_tables + return Lineage( + upstreams=dataplatform_tables, + column_lineage=parsed_result.column_lineage + if parsed_result.column_lineage is not None + else [], + ) class AbstractDataAccessMQueryResolver(ABC): @@ -215,7 +231,7 @@ def resolve_to_data_platform_table_list( ctx: PipelineContext, config: PowerBiDashboardSourceConfig, platform_instance_resolver: AbstractDataPlatformInstanceResolver, - ) -> List[DataPlatformTable]: + ) -> List[Lineage]: pass @@ -471,8 +487,8 @@ def resolve_to_data_platform_table_list( ctx: PipelineContext, config: PowerBiDashboardSourceConfig, platform_instance_resolver: AbstractDataPlatformInstanceResolver, - ) -> List[DataPlatformTable]: - data_platform_tables: List[DataPlatformTable] = [] + ) -> List[Lineage]: + lineage: List[Lineage] = [] # Find out output variable as we are doing backtracking in M-Query output_variable: Optional[str] = tree_function.get_output_variable( @@ -484,7 +500,7 @@ def resolve_to_data_platform_table_list( f"{self.table.full_name}-output-variable", "output-variable not found in table expression", ) - return data_platform_tables + return lineage # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail table_links: List[ @@ -509,7 +525,7 @@ def resolve_to_data_platform_table_list( # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it # & also pass additional information that will be need to generate urn - table_full_name_creator: AbstractDataPlatformTableCreator = ( + table_qualified_name_creator: AbstractDataPlatformTableCreator = ( supported_resolver.get_table_full_name_creator()( ctx=ctx, config=config, @@ -517,11 +533,9 @@ def resolve_to_data_platform_table_list( ) ) - data_platform_tables.extend( - table_full_name_creator.create_dataplatform_tables(f_detail) - ) + lineage.append(table_qualified_name_creator.create_lineage(f_detail)) - return data_platform_tables + return lineage class DefaultTwoStepDataAccessSources(AbstractDataPlatformTableCreator, ABC): @@ -536,7 +550,7 @@ class DefaultTwoStepDataAccessSources(AbstractDataPlatformTableCreator, ABC): def two_level_access_pattern( 
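# Illustrative sketch of the refactor above: resolvers now return a Lineage
# container (upstream tables plus column lineage) instead of a bare table list,
# so "no result" becomes Lineage.empty() rather than []. Types are simplified
# stand-ins for the real DataPlatformTable/ColumnLineageInfo classes.
from dataclasses import dataclass, field
from typing import List

@dataclass
class LineageSketch:
    upstreams: List[str] = field(default_factory=list)
    column_lineage: List[str] = field(default_factory=list)

    @staticmethod
    def empty() -> "LineageSketch":
        return LineageSketch()

result = LineageSketch.empty()
assert result.upstreams == [] and result.column_lineage == []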
self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing {self.get_platform_pair().powerbi_data_platform_name} data-access function detail {data_access_func_detail}" ) @@ -545,7 +559,7 @@ def two_level_access_pattern( data_access_func_detail.arg_list ) if server is None or db_name is None: - return [] # Return empty list + return Lineage.empty() # Return empty list schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor @@ -568,19 +582,21 @@ def two_level_access_pattern( server=server, qualified_table_name=qualified_table_name, ) - - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) class PostgresDataPlatformTableCreator(DefaultTwoStepDataAccessSources): - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: return self.two_level_access_pattern(data_access_func_detail) def get_platform_pair(self) -> DataPlatformPair: @@ -630,10 +646,10 @@ def create_urn_using_old_parser( return dataplatform_tables - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: - dataplatform_tables: List[DataPlatformTable] = [] + ) -> Lineage: + arguments: List[str] = tree_function.strip_char_from_list( values=tree_function.remove_whitespaces_from_list( tree_function.token_values(data_access_func_detail.arg_list) @@ -647,14 +663,17 @@ def create_dataplatform_tables( if len(arguments) >= 4 and arguments[2] != "Query": logger.debug("Unsupported case is found. 
Second index is not the Query") - return dataplatform_tables + return Lineage.empty() if self.config.enable_advance_lineage_sql_construct is False: # Use previous parser to generate URN to keep backward compatibility - return self.create_urn_using_old_parser( - query=arguments[3], - db_name=arguments[1], - server=arguments[0], + return Lineage( + upstreams=self.create_urn_using_old_parser( + query=arguments[3], + db_name=arguments[1], + server=arguments[0], + ), + column_lineage=[], ) return self.parse_custom_sql( @@ -684,9 +703,9 @@ def _get_server_and_db_name(value: str) -> Tuple[Optional[str], Optional[str]]: return tree_function.strip_char_from_list([splitter_result[0]])[0], db_name - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing Oracle data-access function detail {data_access_func_detail}" ) @@ -698,7 +717,7 @@ def create_dataplatform_tables( server, db_name = self._get_server_and_db_name(arguments[0]) if db_name is None or server is None: - return [] + return Lineage.empty() schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor @@ -719,18 +738,21 @@ def create_dataplatform_tables( qualified_table_name=qualified_table_name, ) - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) class DatabrickDataPlatformTableCreator(AbstractDataPlatformTableCreator): - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing Databrick data-access function detail {data_access_func_detail}" ) @@ -749,7 +771,7 @@ def create_dataplatform_tables( logger.debug( "expecting instance to be IdentifierAccessor, please check if parsing is done properly" ) - return [] + return Lineage.empty() db_name: str = value_dict["Database"] schema_name: str = value_dict["Schema"] @@ -762,7 +784,7 @@ def create_dataplatform_tables( logger.info( f"server information is not available for {qualified_table_name}. 
Skipping upstream table" ) - return [] + return Lineage.empty() urn = urn_creator( config=self.config, @@ -772,12 +794,15 @@ def create_dataplatform_tables( qualified_table_name=qualified_table_name, ) - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.DATABRICK_SQL.value @@ -789,9 +814,9 @@ def get_datasource_server( ) -> str: return tree_function.strip_char_from_list([arguments[0]])[0] - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing {self.get_platform_pair().datahub_data_platform_name} function detail {data_access_func_detail}" ) @@ -826,12 +851,15 @@ def create_dataplatform_tables( qualified_table_name=qualified_table_name, ) - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) class SnowflakeDataPlatformTableCreator(DefaultThreeStepDataAccessSources): @@ -859,9 +887,9 @@ class AmazonRedshiftDataPlatformTableCreator(AbstractDataPlatformTableCreator): def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.AMAZON_REDSHIFT.value - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: + ) -> Lineage: logger.debug( f"Processing AmazonRedshift data-access function detail {data_access_func_detail}" ) @@ -870,7 +898,7 @@ def create_dataplatform_tables( data_access_func_detail.arg_list ) if db_name is None or server is None: - return [] # Return empty list + return Lineage.empty() # Return empty list schema_name: str = cast( IdentifierAccessor, data_access_func_detail.identifier_accessor @@ -891,12 +919,15 @@ def create_dataplatform_tables( qualified_table_name=qualified_table_name, ) - return [ - DataPlatformTable( - data_platform_pair=self.get_platform_pair(), - urn=urn, - ) - ] + return Lineage( + upstreams=[ + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ], + column_lineage=[], + ) class NativeQueryDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -916,9 +947,7 @@ def is_native_parsing_supported(data_access_function_name: str) -> bool: in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM ) - def create_urn_using_old_parser( - self, query: str, server: str - ) -> List[DataPlatformTable]: + def create_urn_using_old_parser(self, query: str, server: str) -> Lineage: dataplatform_tables: List[DataPlatformTable] = [] tables: List[str] = native_sql_parser.get_tables(query) @@ -947,12 +976,14 @@ def create_urn_using_old_parser( logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") - return dataplatform_tables + return Lineage( + upstreams=dataplatform_tables, + column_lineage=[], + ) - def create_dataplatform_tables( + def create_lineage( self, data_access_func_detail: DataAccessFunctionDetail - ) -> List[DataPlatformTable]: - dataplatform_tables: List[DataPlatformTable] = [] + ) -> Lineage: t1: Tree = cast( Tree, tree_function.first_arg_list_func(data_access_func_detail.arg_list) ) @@ -963,7 +994,7 @@ def 
create_dataplatform_tables( f"Expecting 2 argument, actual argument count is {len(flat_argument_list)}" ) logger.debug(f"Flat argument list = {flat_argument_list}") - return dataplatform_tables + return Lineage.empty() data_access_tokens: List[str] = tree_function.remove_whitespaces_from_list( tree_function.token_values(flat_argument_list[0]) ) @@ -981,7 +1012,7 @@ def create_dataplatform_tables( f"Server is not available in argument list for data-platform {data_access_tokens[0]}. Returning empty " "list" ) - return dataplatform_tables + return Lineage.empty() self.current_data_platform = self.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM[ data_access_tokens[0] diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 5d477ee090e7e6..4611a8eed47827 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -44,6 +44,11 @@ StatefulIngestionSourceBase, ) from datahub.metadata.com.linkedin.pegasus2avro.common import ChangeAuditStamps +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + FineGrainedLineage, + FineGrainedLineageDownstreamType, + FineGrainedLineageUpstreamType, +) from datahub.metadata.schema_classes import ( BrowsePathsClass, ChangeTypeClass, @@ -71,6 +76,7 @@ ViewPropertiesClass, ) from datahub.utilities.dedup_list import deduplicate_list +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo # Logger instance logger = logging.getLogger(__name__) @@ -165,6 +171,48 @@ def extract_dataset_schema( ) return [schema_mcp] + def make_fine_grained_lineage_class( + self, lineage: resolver.Lineage, dataset_urn: str + ) -> List[FineGrainedLineage]: + fine_grained_lineages: List[FineGrainedLineage] = [] + + if ( + self.__config.extract_column_level_lineage is False + or self.__config.extract_lineage is False + ): + return fine_grained_lineages + + if lineage is None: + return fine_grained_lineages + + logger.info("Extracting column level lineage") + + cll: List[ColumnLineageInfo] = lineage.column_lineage + + for cll_info in cll: + downstream = ( + [builder.make_schema_field_urn(dataset_urn, cll_info.downstream.column)] + if cll_info.downstream is not None + and cll_info.downstream.column is not None + else [] + ) + + upstreams = [ + builder.make_schema_field_urn(column_ref.table, column_ref.column) + for column_ref in cll_info.upstreams + ] + + fine_grained_lineages.append( + FineGrainedLineage( + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=downstream, + upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=upstreams, + ) + ) + + return fine_grained_lineages + def extract_lineage( self, table: powerbi_data_classes.Table, ds_urn: str ) -> List[MetadataChangeProposalWrapper]: @@ -174,8 +222,9 @@ def extract_lineage( parameters = table.dataset.parameters if table.dataset else {} upstream: List[UpstreamClass] = [] + cll_lineage: List[FineGrainedLineage] = [] - upstream_dpts: List[resolver.DataPlatformTable] = parser.get_upstream_tables( + upstream_lineage: List[resolver.Lineage] = parser.get_upstream_tables( table=table, reporter=self.__reporter, platform_instance_resolver=self.__dataplatform_instance_resolver, @@ -185,34 +234,48 @@ def extract_lineage( ) logger.debug( - f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_dpts}" + f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = 
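# Illustrative sketch of the column-lineage fan-out in
# make_fine_grained_lineage_class above: each parsed column edge becomes one
# FIELD_SET -> FIELD entry whose endpoints are schemaField urns. The urns and
# dict shape here are hypothetical simplifications of the real classes.
from typing import Dict, List

def to_fine_grained(dataset_urn: str, edges: List[Dict]) -> List[Dict]:
    entries = []
    for edge in edges:
        entries.append(
            {
                "upstreamType": "FIELD_SET",
                "upstreams": [
                    f"urn:li:schemaField:({up['table']},{up['column']})"
                    for up in edge["upstreams"]
                ],
                "downstreamType": "FIELD",
                "downstreams": [
                    f"urn:li:schemaField:({dataset_urn},{edge['downstream']})"
                ],
            }
        )
    return entries

demo = to_fine_grained(
    "urn:li:dataset:demo",
    [{"downstream": "amount", "upstreams": [{"table": "urn:li:dataset:src", "column": "amt"}]}],
)
assert demo[0]["downstreams"] == ["urn:li:schemaField:(urn:li:dataset:demo,amount)"]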
{upstream_lineage}" ) - for upstream_dpt in upstream_dpts: - if ( - upstream_dpt.data_platform_pair.powerbi_data_platform_name - not in self.__config.dataset_type_mapping.keys() - ): - logger.debug( - f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", + for lineage in upstream_lineage: + for upstream_dpt in lineage.upstreams: + if ( + upstream_dpt.data_platform_pair.powerbi_data_platform_name + not in self.__config.dataset_type_mapping.keys() + ): + logger.debug( + f"Skipping upstream table for {ds_urn}. The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", + ) + continue + + upstream_table_class = UpstreamClass( + upstream_dpt.urn, + DatasetLineageTypeClass.TRANSFORMED, ) - continue - upstream_table_class = UpstreamClass( - upstream_dpt.urn, - DatasetLineageTypeClass.TRANSFORMED, - ) + upstream.append(upstream_table_class) - upstream.append(upstream_table_class) + # Add column level lineage if any + cll_lineage.extend( + self.make_fine_grained_lineage_class( + lineage=lineage, + dataset_urn=ds_urn, + ) + ) if len(upstream) > 0: - upstream_lineage = UpstreamLineageClass(upstreams=upstream) + upstream_lineage_class: UpstreamLineageClass = UpstreamLineageClass( + upstreams=upstream, + fineGrainedLineages=cll_lineage or None, + ) + logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}") + mcp = MetadataChangeProposalWrapper( entityType=Constant.DATASET, changeType=ChangeTypeClass.UPSERT, entityUrn=ds_urn, - aspect=upstream_lineage, + aspect=upstream_lineage_class, ) mcps.append(mcp) @@ -1075,6 +1138,14 @@ def report_to_datahub_work_units( SourceCapability.OWNERSHIP, "Disabled by default, configured using `extract_ownership`", ) +@capability( + SourceCapability.LINEAGE_COARSE, + "Enabled by default, configured using `extract_lineage`.", +) +@capability( + SourceCapability.LINEAGE_FINE, + "Disabled by default, configured using `extract_column_level_lineage`. ", +) class PowerBiDashboardSource(StatefulIngestionSourceBase): """ This plugin extracts the following: diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py index 93850607e551e1..79b044841e0541 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/config.py @@ -7,8 +7,8 @@ from datahub.configuration import ConfigModel from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetLineageProviderConfigBase +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.data_lake_common.path_spec import PathSpec from datahub.ingestion.source.sql.postgres import BasePostgresConfig from datahub.ingestion.source.state.stateful_ingestion_base import ( @@ -132,6 +132,16 @@ class RedshiftConfig( description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", ) + extract_column_level_lineage: bool = Field( + default=True, + description="Whether to extract column level lineage. This config works with rest-sink only.", + ) + + incremental_lineage: bool = Field( + default=False, + description="When enabled, emits lineage as incremental to existing lineage already in DataHub. 
When disabled, re-states lineage on each run. This config works with rest-sink only.", + ) + @root_validator(pre=True) def check_email_is_set_on_usage(cls, values): if values.get("include_usage_statistics"): diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py index bbe52b5d98ba36..c9ddfbe92ab2ab 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/lineage.py @@ -9,10 +9,12 @@ import humanfriendly import redshift_connector -from sqllineage.runner import LineageRunner +import datahub.emitter.mce_builder as builder +import datahub.utilities.sqlglot_lineage as sqlglot_l from datahub.emitter import mce_builder from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.aws.s3_util import strip_s3_prefix from datahub.ingestion.source.redshift.common import get_db_name from datahub.ingestion.source.redshift.config import LineageMode, RedshiftConfig @@ -28,13 +30,19 @@ from datahub.ingestion.source.state.redundant_run_skip_handler import ( RedundantLineageRunSkipHandler, ) -from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage +from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( + FineGrainedLineage, + FineGrainedLineageDownstreamType, + FineGrainedLineageUpstreamType, + UpstreamLineage, +) from datahub.metadata.schema_classes import ( DatasetLineageTypeClass, UpstreamClass, UpstreamLineageClass, ) from datahub.utilities import memory_footprint +from datahub.utilities.urns import dataset_urn logger: logging.Logger = logging.getLogger(__name__) @@ -56,13 +64,14 @@ class LineageCollectorType(Enum): @dataclass(frozen=True, eq=True) class LineageDataset: platform: LineageDatasetPlatform - path: str + urn: str @dataclass() class LineageItem: dataset: LineageDataset upstreams: Set[LineageDataset] + cll: Optional[List[sqlglot_l.ColumnLineageInfo]] collector_type: LineageCollectorType dataset_lineage_type: str = field(init=False) @@ -83,10 +92,12 @@ def __init__( self, config: RedshiftConfig, report: RedshiftReport, + context: PipelineContext, redundant_run_skip_handler: Optional[RedundantLineageRunSkipHandler] = None, ): self.config = config self.report = report + self.context = context self._lineage_map: Dict[str, LineageItem] = defaultdict() self.redundant_run_skip_handler = redundant_run_skip_handler @@ -121,33 +132,37 @@ def _get_s3_path(self, path: str) -> str: return path - def _get_sources_from_query(self, db_name: str, query: str) -> List[LineageDataset]: + def _get_sources_from_query( + self, db_name: str, query: str + ) -> Tuple[List[LineageDataset], Optional[List[sqlglot_l.ColumnLineageInfo]]]: sources: List[LineageDataset] = list() - parser = LineageRunner(query) + parsed_result: Optional[ + sqlglot_l.SqlParsingResult + ] = sqlglot_l.create_lineage_sql_parsed_result( + query=query, + platform=LineageDatasetPlatform.REDSHIFT.value, + platform_instance=self.config.platform_instance, + database=db_name, + schema=str(self.config.default_schema), + graph=self.context.graph, + env=self.config.env, + ) - for table in parser.source_tables: - split = str(table).split(".") - if len(split) == 3: - db_name, source_schema, source_table = split - elif len(split) == 2: - source_schema, source_table = split - else: - raise ValueError( - f"Invalid table name {table} in query 
{query}. " - f"Expected format: [db_name].[schema].[table] or [schema].[table] or [table]." - ) + if parsed_result is None: + logger.debug(f"native query parsing failed for {query}") + return sources, None - if source_schema == "": - source_schema = str(self.config.default_schema) + logger.debug(f"parsed_result = {parsed_result}") + for table_urn in parsed_result.in_tables: source = LineageDataset( platform=LineageDatasetPlatform.REDSHIFT, - path=f"{db_name}.{source_schema}.{source_table}", + urn=table_urn, ) sources.append(source) - return sources + return sources, parsed_result.column_lineage def _build_s3_path_from_row(self, filename: str) -> str: path = filename.strip() @@ -165,9 +180,11 @@ def _get_sources( source_table: Optional[str], ddl: Optional[str], filename: Optional[str], - ) -> List[LineageDataset]: + ) -> Tuple[List[LineageDataset], Optional[List[sqlglot_l.ColumnLineageInfo]]]: sources: List[LineageDataset] = list() # Source + cll: Optional[List[sqlglot_l.ColumnLineageInfo]] = None + if ( lineage_type in { @@ -177,7 +194,7 @@ def _get_sources( and ddl is not None ): try: - sources = self._get_sources_from_query(db_name=db_name, query=ddl) + sources, cll = self._get_sources_from_query(db_name=db_name, query=ddl) except Exception as e: logger.warning( f"Error parsing query {ddl} for getting lineage. Error was {e}." @@ -192,22 +209,38 @@ def _get_sources( "Only s3 source supported with copy. The source was: {path}." ) self.report.num_lineage_dropped_not_support_copy_path += 1 - return sources + return sources, cll path = strip_s3_prefix(self._get_s3_path(path)) + urn = make_dataset_urn_with_platform_instance( + platform=platform.value, + name=path, + env=self.config.env, + platform_instance=self.config.platform_instance_map.get( + platform.value + ) + if self.config.platform_instance_map is not None + else None, + ) elif source_schema is not None and source_table is not None: platform = LineageDatasetPlatform.REDSHIFT path = f"{db_name}.{source_schema}.{source_table}" + urn = make_dataset_urn_with_platform_instance( + platform=platform.value, + platform_instance=self.config.platform_instance, + name=path, + env=self.config.env, + ) else: - return [] + return [], cll sources = [ LineageDataset( platform=platform, - path=path, + urn=urn, ) ] - return sources + return sources, cll def _populate_lineage_map( self, @@ -231,6 +264,7 @@ def _populate_lineage_map( :rtype: None """ try: + cll: Optional[List[sqlglot_l.ColumnLineageInfo]] = None raw_db_name = database alias_db_name = get_db_name(self.config) @@ -243,7 +277,7 @@ def _populate_lineage_map( if not target: continue - sources = self._get_sources( + sources, cll = self._get_sources( lineage_type, alias_db_name, source_schema=lineage_row.source_schema, @@ -251,6 +285,7 @@ def _populate_lineage_map( ddl=lineage_row.ddl, filename=lineage_row.filename, ) + target.cll = cll target.upstreams.update( self._get_upstream_lineages( @@ -262,20 +297,16 @@ def _populate_lineage_map( ) # Merging downstreams if dataset already exists and has downstreams - if target.dataset.path in self._lineage_map: - self._lineage_map[ - target.dataset.path - ].upstreams = self._lineage_map[ - target.dataset.path - ].upstreams.union( - target.upstreams - ) + if target.dataset.urn in self._lineage_map: + self._lineage_map[target.dataset.urn].upstreams = self._lineage_map[ + target.dataset.urn + ].upstreams.union(target.upstreams) else: - self._lineage_map[target.dataset.path] = target + self._lineage_map[target.dataset.urn] = target logger.debug( - 
f"Lineage[{target}]:{self._lineage_map[target.dataset.path]}" + f"Lineage[{target}]:{self._lineage_map[target.dataset.urn]}" ) except Exception as e: self.warn( @@ -308,17 +339,34 @@ def _get_target_lineage( target_platform = LineageDatasetPlatform.S3 # Following call requires 'filename' key in lineage_row target_path = self._build_s3_path_from_row(lineage_row.filename) + urn = make_dataset_urn_with_platform_instance( + platform=target_platform.value, + name=target_path, + env=self.config.env, + platform_instance=self.config.platform_instance_map.get( + target_platform.value + ) + if self.config.platform_instance_map is not None + else None, + ) except ValueError as e: self.warn(logger, "non-s3-lineage", str(e)) return None else: target_platform = LineageDatasetPlatform.REDSHIFT target_path = f"{alias_db_name}.{lineage_row.target_schema}.{lineage_row.target_table}" + urn = make_dataset_urn_with_platform_instance( + platform=target_platform.value, + platform_instance=self.config.platform_instance, + name=target_path, + env=self.config.env, + ) return LineageItem( - dataset=LineageDataset(platform=target_platform, path=target_path), + dataset=LineageDataset(platform=target_platform, urn=urn), upstreams=set(), collector_type=lineage_type, + cll=None, ) def _get_upstream_lineages( @@ -331,11 +379,22 @@ def _get_upstream_lineages( targe_source = [] for source in sources: if source.platform == LineageDatasetPlatform.REDSHIFT: - db, schema, table = source.path.split(".") + qualified_table_name = dataset_urn.DatasetUrn.create_from_string( + source.urn + ).get_entity_id()[1] + db, schema, table = qualified_table_name.split(".") if db == raw_db_name: db = alias_db_name path = f"{db}.{schema}.{table}" - source = LineageDataset(platform=source.platform, path=path) + source = LineageDataset( + platform=source.platform, + urn=make_dataset_urn_with_platform_instance( + platform=LineageDatasetPlatform.REDSHIFT.value, + platform_instance=self.config.platform_instance, + name=path, + env=self.config.env, + ), + ) # Filtering out tables which does not exist in Redshift # It was deleted in the meantime or query parser did not capture well the table name @@ -345,7 +404,7 @@ def _get_upstream_lineages( or not any(table == t.name for t in all_tables[db][schema]) ): logger.debug( - f"{source.path} missing table, dropping from lineage.", + f"{source.urn} missing table, dropping from lineage.", ) self.report.num_lineage_tables_dropped += 1 continue @@ -433,36 +492,73 @@ def populate_lineage( memory_footprint.total_size(self._lineage_map) ) + def make_fine_grained_lineage_class( + self, lineage_item: LineageItem, dataset_urn: str + ) -> List[FineGrainedLineage]: + fine_grained_lineages: List[FineGrainedLineage] = [] + + if ( + self.config.extract_column_level_lineage is False + or lineage_item.cll is None + ): + logger.debug("CLL extraction is disabled") + return fine_grained_lineages + + logger.debug("Extracting column level lineage") + + cll: List[sqlglot_l.ColumnLineageInfo] = lineage_item.cll + + for cll_info in cll: + downstream = ( + [builder.make_schema_field_urn(dataset_urn, cll_info.downstream.column)] + if cll_info.downstream is not None + and cll_info.downstream.column is not None + else [] + ) + + upstreams = [ + builder.make_schema_field_urn(column_ref.table, column_ref.column) + for column_ref in cll_info.upstreams + ] + + fine_grained_lineages.append( + FineGrainedLineage( + downstreamType=FineGrainedLineageDownstreamType.FIELD, + downstreams=downstream, + 
upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, + upstreams=upstreams, + ) + ) + + logger.debug(f"Created fine_grained_lineage for {dataset_urn}") + + return fine_grained_lineages + def get_lineage( self, table: Union[RedshiftTable, RedshiftView], dataset_urn: str, schema: RedshiftSchema, ) -> Optional[Tuple[UpstreamLineageClass, Dict[str, str]]]: - dataset_key = mce_builder.dataset_urn_to_key(dataset_urn) - if dataset_key is None: - return None upstream_lineage: List[UpstreamClass] = [] - if dataset_key.name in self._lineage_map: - item = self._lineage_map[dataset_key.name] + cll_lineage: List[FineGrainedLineage] = [] + + if dataset_urn in self._lineage_map: + item = self._lineage_map[dataset_urn] for upstream in item.upstreams: upstream_table = UpstreamClass( - dataset=make_dataset_urn_with_platform_instance( - upstream.platform.value, - upstream.path, - platform_instance=self.config.platform_instance_map.get( - upstream.platform.value - ) - if self.config.platform_instance_map - else None, - env=self.config.env, - ), + dataset=upstream.urn, type=item.dataset_lineage_type, ) upstream_lineage.append(upstream_table) + cll_lineage = self.make_fine_grained_lineage_class( + lineage_item=item, + dataset_urn=dataset_urn, + ) + tablename = table.name if table.type == "EXTERNAL_TABLE": # external_db_params = schema.option @@ -489,7 +585,12 @@ def get_lineage( else: return None - return UpstreamLineage(upstreams=upstream_lineage), {} + return ( + UpstreamLineage( + upstreams=upstream_lineage, fineGrainedLineages=cll_lineage or None + ), + {}, + ) def report_status(self, step: str, status: bool) -> None: if self.redundant_run_skip_handler: diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py index e983734082b1dc..771636e8498a30 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/profile.py @@ -1,33 +1,19 @@ -import dataclasses import logging -from datetime import datetime -from typing import Dict, Iterable, List, Optional, Union, cast +from typing import Dict, Iterable, List, Optional, Union -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.ge_data_profiler import GEProfilerRequest from datahub.ingestion.source.redshift.config import RedshiftConfig from datahub.ingestion.source.redshift.redshift_schema import ( RedshiftTable, RedshiftView, ) from datahub.ingestion.source.redshift.report import RedshiftReport -from datahub.ingestion.source.sql.sql_generic_profiler import ( - GenericProfiler, - TableProfilerRequest, -) +from datahub.ingestion.source.sql.sql_generic_profiler import GenericProfiler from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler logger = logging.getLogger(__name__) -@dataclasses.dataclass -class RedshiftProfilerRequest(GEProfilerRequest): - table: Union[RedshiftTable, RedshiftView] - profile_table_level_only: bool = False - - class RedshiftProfiler(GenericProfiler): config: RedshiftConfig report: RedshiftReport @@ -63,80 +49,21 @@ def get_workunits( continue for table in tables[db].get(schema, {}): # Emit the profile work unit - profile_request = self.get_redshift_profile_request( - table, schema, db - ) + profile_request = self.get_profile_request(table, schema, db) if 
profile_request is not None: + self.report.report_entity_profiled(profile_request.pretty_name) profile_requests.append(profile_request) if len(profile_requests) == 0: continue - table_profile_requests = cast(List[TableProfilerRequest], profile_requests) - for request, profile in self.generate_profiles( - table_profile_requests, + + yield from self.generate_profile_workunits( + profile_requests, self.config.profiling.max_workers, db, platform=self.platform, profiler_args=self.get_profile_args(), - ): - if profile is None: - continue - request = cast(RedshiftProfilerRequest, request) - - profile.sizeInBytes = request.table.size_in_bytes - dataset_name = request.pretty_name - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) - - # We don't add to the profiler state if we only do table level profiling as it always happens - if self.state_handler and not request.profile_table_level_only: - self.state_handler.add_to_state( - dataset_urn, int(datetime.now().timestamp() * 1000) - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=profile - ).as_workunit() - - def get_redshift_profile_request( - self, - table: Union[RedshiftTable, RedshiftView], - schema_name: str, - db_name: str, - ) -> Optional[RedshiftProfilerRequest]: - skip_profiling = False - profile_table_level_only = self.config.profiling.profile_table_level_only - dataset_name = f"{db_name}.{schema_name}.{table.name}".lower() - if not self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, table.size_in_bytes, table.rows_count - ): - # Profile only table level if dataset is filtered from profiling - # due to size limits alone - if self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, 0, 0 - ): - profile_table_level_only = True - else: - skip_profiling = True - - if len(table.columns) == 0: - skip_profiling = True - - if skip_profiling: - if self.config.profiling.report_dropped_profiles: - self.report.report_dropped(f"profile of {dataset_name}") - return None + ) - self.report.report_entity_profiled(dataset_name) - logger.debug(f"Preparing profiling request for {dataset_name}") - profile_request = RedshiftProfilerRequest( - pretty_name=dataset_name, - batch_kwargs=dict(schema=schema_name, table=table.name), - table=table, - profile_table_level_only=profile_table_level_only, - ) - return profile_request + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + return f"{db_name}.{schema_name}.{table_name}".lower() diff --git a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py index e8a8ff976afa6c..c7d01021773b12 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/redshift/redshift.py @@ -1,5 +1,6 @@ import logging from collections import defaultdict +from functools import partial from typing import Dict, Iterable, List, Optional, Type, Union import humanfriendly @@ -25,6 +26,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.source import ( CapabilityReport, MetadataWorkUnitProcessor, @@ -216,6 +218,9 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource): ] = { "BYTES": BytesType, "BOOL": BooleanType, + "BOOLEAN": BooleanType, + "DOUBLE": NumberType, + "DOUBLE 
PRECISION": NumberType, "DECIMAL": NumberType, "NUMERIC": NumberType, "BIGNUMERIC": NumberType, @@ -242,6 +247,13 @@ class RedshiftSource(StatefulIngestionSourceBase, TestableSource): "CHARACTER": StringType, "CHAR": StringType, "TIMESTAMP WITHOUT TIME ZONE": TimeType, + "REAL": NumberType, + "VARCHAR": StringType, + "TIMESTAMPTZ": TimeType, + "GEOMETRY": NullType, + "HLLSKETCH": NullType, + "TIMETZ": TimeType, + "VARBYTE": StringType, } def get_platform_instance_id(self) -> str: @@ -369,6 +381,11 @@ def gen_database_container(self, database: str) -> Iterable[MetadataWorkUnit]: def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), + partial( + auto_incremental_lineage, + self.ctx.graph, + self.config.incremental_lineage, + ), StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, @@ -881,6 +898,7 @@ def extract_lineage( self.lineage_extractor = RedshiftLineageExtractor( config=self.config, report=self.report, + context=self.ctx, redundant_run_skip_handler=self.redundant_lineage_run_skip_handler, ) @@ -941,7 +959,9 @@ def generate_lineage(self, database: str) -> Iterable[MetadataWorkUnit]: ) if lineage_info: yield from gen_lineage( - dataset_urn, lineage_info, self.config.incremental_lineage + dataset_urn, + lineage_info, + incremental_lineage=False, # incremental lineage generation is taken care of by auto_incremental_lineage ) for schema in self.db_views[database]: @@ -955,7 +975,9 @@ def generate_lineage(self, database: str) -> Iterable[MetadataWorkUnit]: ) if lineage_info: yield from gen_lineage( - dataset_urn, lineage_info, self.config.incremental_lineage + dataset_urn, + lineage_info, + incremental_lineage=False, # incremental lineage generation is taken care of by auto_incremental_lineage ) def add_config_to_report(self): diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py index f1dd622efb7468..3ef6476078f6fb 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/config.py @@ -5,8 +5,8 @@ from pydantic.fields import Field from datahub.configuration.common import AllowDenyPattern -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.source.aws.aws_common import AwsConnectionConfig from datahub.ingestion.source.data_lake_common.config import PathSpecsConfigMixin @@ -75,7 +75,10 @@ class DataLakeSourceConfig( default=100, description="Maximum number of rows to use when inferring schemas for TSV and CSV files.", ) - + add_partition_columns_to_schema: bool = Field( + default=False, + description="Whether to add partition fields to the schema.", + ) verify_ssl: Union[bool, str] = Field( default=True, description="Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index ac4433b7eb1f0c..94c571eabad11a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -10,6 +10,7 @@ from pathlib import PurePath from typing import Any, Dict, Iterable, List, Optional, Tuple +import smart_open.compression as so_compression from more_itertools import peekable from pyspark.conf import SparkConf from pyspark.sql import SparkSession @@ -77,6 +78,7 @@ NullTypeClass, NumberTypeClass, RecordTypeClass, + SchemaField, SchemaFieldDataType, SchemaMetadata, StringTypeClass, @@ -89,6 +91,7 @@ OperationClass, OperationTypeClass, OtherSchemaClass, + SchemaFieldDataTypeClass, _Aspect, ) from datahub.telemetry import stats, telemetry @@ -120,6 +123,9 @@ } PAGE_SIZE = 1000 +# Hack to support the .gzip extension with smart_open. +so_compression.register_compressor(".gzip", so_compression._COMPRESSOR_REGISTRY[".gz"]) + def get_column_type( report: SourceReport, dataset_name: str, column_type: str @@ -407,7 +413,9 @@ def get_fields(self, table_data: TableData, path_spec: PathSpec) -> List: table_data.full_path, "rb", transport_params={"client": s3_client} ) else: - file = open(table_data.full_path, "rb") + # We still use smart_open here to take advantage of the compression + # capabilities of smart_open. + file = smart_open(table_data.full_path, "rb") fields = [] @@ -452,8 +460,39 @@ def get_fields(self, table_data: TableData, path_spec: PathSpec) -> List: logger.debug(f"Extracted fields in schema: {fields}") fields = sorted(fields, key=lambda f: f.fieldPath) + if self.source_config.add_partition_columns_to_schema: + self.add_partition_columns_to_schema( + fields=fields, path_spec=path_spec, full_path=table_data.full_path + ) + return fields + def add_partition_columns_to_schema( + self, path_spec: PathSpec, full_path: str, fields: List[SchemaField] + ) -> None: + is_fieldpath_v2 = False + for field in fields: + if field.fieldPath.startswith("[version=2.0]"): + is_fieldpath_v2 = True + break + vars = path_spec.get_named_vars(full_path) + if vars is not None and "partition_key" in vars: + for partition_key in vars["partition_key"].values(): + fields.append( + SchemaField( + fieldPath=f"{partition_key}" + if not is_fieldpath_v2 + else f"[version=2.0].[type=string].{partition_key}", + nativeDataType="string", + type=SchemaFieldDataType(StringTypeClass()) + if not is_fieldpath_v2 + else SchemaFieldDataTypeClass(type=StringTypeClass()), + isPartitioningKey=True, + nullable=True, + recursive=False, + ) + ) + def get_table_profile( self, table_data: TableData, dataset_urn: str ) -> Iterable[MetadataWorkUnit]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py index 5797d66aa4d19e..b58bdf41ccaa5a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py +++ b/metadata-ingestion/src/datahub/ingestion/source/schema_inference/object.py @@ -16,7 +16,7 @@ class SchemaDescription(BasicSchemaDescription): nullable: bool # if field is ever missing -def is_field_nullable(doc: Dict[str, Any], field_path: Tuple) -> bool: +def is_field_nullable(doc: Dict[str, Any], field_path: Tuple[str, ...]) -> bool: """ Check if a nested field is nullable in a document from a collection. 
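A minimal sketch, separate from the patch itself, of the partition field that the add_partition_columns_to_schema change above would append for a hypothetical partition key named "year", mirroring the kwargs used in the added code:

from datahub.metadata.com.linkedin.pegasus2avro.schema import (
    SchemaField,
    SchemaFieldDataType,
    StringTypeClass,
)

# Partition keys are emitted as nullable string fields flagged as
# partitioning keys; under fieldpath v2 the path carries version and
# type prefixes, otherwise it is just the bare key name.
partition_field = SchemaField(
    fieldPath="[version=2.0].[type=string].year",
    nativeDataType="string",
    type=SchemaFieldDataType(StringTypeClass()),
    isPartitioningKey=True,
    nullable=True,
    recursive=False,
)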
@@ -54,7 +54,10 @@ def is_field_nullable(doc: Dict[str, Any], field_path: Tuple) -> bool: # count empty lists of nested objects as nullable if len(value) == 0: return True - return any(is_field_nullable(x, remaining_fields) for x in doc[field]) + return any( + isinstance(x, dict) and is_field_nullable(x, remaining_fields) + for x in doc[field] + ) # any other types to check? # raise ValueError("Nested type not 'list' or 'dict' encountered") diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 95f64443844088..032bdef178fdf6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -101,8 +101,8 @@ class SnowflakeV2Config( ) include_view_column_lineage: bool = Field( - default=False, - description="Populates view->view and table->view column lineage.", + default=True, + description="Populates view->view and table->view column lineage using DataHub's sql parser.", ) _check_role_grants_removed = pydantic_removed_field("check_role_grants") diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py index 9a993f57740329..4219533dc217ca 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_v2.py @@ -20,12 +20,12 @@ import datahub.emitter.mce_builder as builder from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.sql_parsing_builder import SqlParsingBuilder from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws.s3_util import make_s3_urn +from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.snowflake.constants import ( LINEAGE_PERMISSION_ERROR, SnowflakeEdition, - SnowflakeObjectDomain, ) from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery @@ -53,7 +53,6 @@ sqlglot_lineage, ) from datahub.utilities.time import ts_millis_to_datetime -from datahub.utilities.urns.dataset_urn import DatasetUrn logger: logging.Logger = logging.getLogger(__name__) @@ -136,7 +135,6 @@ def get_workunits( return self._populate_external_lineage_map(discovered_tables) - if self.config.include_view_lineage: if len(discovered_views) > 0: yield from self.get_view_upstream_workunits( @@ -196,19 +194,6 @@ def get_table_upstream_workunits( f"Upstream lineage detected for {self.report.num_tables_with_upstreams} tables.", ) - def _gen_workunit_from_sql_parsing_result( - self, - dataset_identifier: str, - result: SqlParsingResult, - ) -> MetadataWorkUnit: - upstreams, fine_upstreams = self.get_upstreams_from_sql_parsing_result( - self.dataset_urn_builder(dataset_identifier), result - ) - self.report.num_views_with_upstreams += 1 - return self._create_upstream_lineage_workunit( - dataset_identifier, upstreams, fine_upstreams - ) - def _gen_workunits_from_query_result( self, discovered_assets: Collection[str], @@ -242,18 +227,31 @@ def get_view_upstream_workunits( schema_resolver: SchemaResolver, view_definitions: MutableMapping[str, str], ) -> Iterable[MetadataWorkUnit]: - views_processed = set() + views_failed_parsing = set() if 
self.config.include_view_column_lineage: with PerfTimer() as timer: + builder = SqlParsingBuilder( + generate_lineage=True, + generate_usage_statistics=False, + generate_operations=False, + ) for view_identifier, view_definition in view_definitions.items(): result = self._run_sql_parser( view_identifier, view_definition, schema_resolver ) - if result: - views_processed.add(view_identifier) - yield self._gen_workunit_from_sql_parsing_result( - view_identifier, result + if result and result.out_tables: + self.report.num_views_with_upstreams += 1 + # This does not yield any workunits but we use + # yield here to execute this method + yield from builder.process_sql_parsing_result( + result=result, + query=view_definition, + is_view_ddl=True, ) + else: + views_failed_parsing.add(view_identifier) + + yield from builder.gen_workunits() self.report.view_lineage_parse_secs = timer.elapsed_seconds() with PerfTimer() as timer: @@ -261,7 +259,7 @@ def get_view_upstream_workunits( if results: yield from self._gen_workunits_from_query_result( - set(discovered_views) - views_processed, + views_failed_parsing, results, upstream_for_view=True, ) @@ -349,39 +347,6 @@ def get_upstreams_from_query_result_row( return upstreams, fine_upstreams - def get_upstreams_from_sql_parsing_result( - self, downstream_table_urn: str, result: SqlParsingResult - ) -> Tuple[List[UpstreamClass], List[FineGrainedLineage]]: - # Note: This ignores the out_tables section of the sql parsing result. - upstreams = [ - UpstreamClass(dataset=upstream_table_urn, type=DatasetLineageTypeClass.VIEW) - for upstream_table_urn in set(result.in_tables) - ] - - # Maps downstream_col -> [upstream_col] - fine_lineage: Dict[str, Set[SnowflakeColumnId]] = defaultdict(set) - for column_lineage in result.column_lineage or []: - out_column = column_lineage.downstream.column - for upstream_column_info in column_lineage.upstreams: - upstream_table_name = DatasetUrn.create_from_string( - upstream_column_info.table - ).get_dataset_name() - fine_lineage[out_column].add( - SnowflakeColumnId( - columnName=upstream_column_info.column, - objectName=upstream_table_name, - objectDomain=SnowflakeObjectDomain.VIEW.value, - ) - ) - fine_upstreams = [ - self.build_finegrained_lineage( - downstream_table_urn, downstream_col, upstream_cols - ) - for downstream_col, upstream_cols in fine_lineage.items() - ] - - return upstreams, list(filter(None, fine_upstreams)) - def _populate_external_lineage_map(self, discovered_tables: List[str]) -> None: with PerfTimer() as timer: self.report.num_external_table_edges_scanned = 0 @@ -652,7 +617,9 @@ def get_external_upstreams(self, external_lineage: Set[str]) -> List[UpstreamCla # For now, populate only for S3 if external_lineage_entry.startswith("s3://"): external_upstream_table = UpstreamClass( - dataset=make_s3_urn(external_lineage_entry, self.config.env), + dataset=make_s3_urn_for_lineage( + external_lineage_entry, self.config.env + ), type=DatasetLineageTypeClass.COPY, ) external_upstreams.append(external_upstream_table) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py index 5f5e8e4bcdea38..8e18d85d6f3ca3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_profiler.py @@ -1,20 +1,12 @@ -import dataclasses import logging -from datetime import datetime -from typing import Callable, Dict, 
Iterable, List, Optional, cast +from typing import Callable, Dict, Iterable, List, Optional from snowflake.sqlalchemy import snowdialect from sqlalchemy import create_engine, inspect from sqlalchemy.sql import sqltypes -from datahub.configuration.pattern_utils import is_schema_allowed -from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance -from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.ge_data_profiler import ( - DatahubGEProfiler, - GEProfilerRequest, -) +from datahub.ingestion.source.ge_data_profiler import DatahubGEProfiler from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report @@ -23,10 +15,8 @@ SnowflakeTable, ) from datahub.ingestion.source.snowflake.snowflake_utils import SnowflakeCommonMixin -from datahub.ingestion.source.sql.sql_generic_profiler import ( - GenericProfiler, - TableProfilerRequest, -) +from datahub.ingestion.source.sql.sql_generic import BaseTable +from datahub.ingestion.source.sql.sql_generic_profiler import GenericProfiler from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler snowdialect.ischema_names["GEOGRAPHY"] = sqltypes.NullType @@ -35,12 +25,6 @@ logger = logging.getLogger(__name__) -@dataclasses.dataclass -class SnowflakeProfilerRequest(GEProfilerRequest): - table: SnowflakeTable - profile_table_level_only: bool = False - - class SnowflakeProfiler(GenericProfiler, SnowflakeCommonMixin): def __init__( self, @@ -65,101 +49,52 @@ def get_workunits( profile_requests = [] for schema in database.schemas: - if not is_schema_allowed( - self.config.schema_pattern, - schema.name, - database.name, - self.config.match_fully_qualified_names, - ): - continue - for table in db_tables[schema.name]: - profile_request = self.get_snowflake_profile_request( + profile_request = self.get_profile_request( table, schema.name, database.name ) if profile_request is not None: + self.report.report_entity_profiled(profile_request.pretty_name) profile_requests.append(profile_request) if len(profile_requests) == 0: return - table_profile_requests = cast(List[TableProfilerRequest], profile_requests) - - for request, profile in self.generate_profiles( - table_profile_requests, + yield from self.generate_profile_workunits( + profile_requests, self.config.profiling.max_workers, database.name, platform=self.platform, profiler_args=self.get_profile_args(), - ): - if profile is None: - continue - profile.sizeInBytes = cast( - SnowflakeProfilerRequest, request - ).table.size_in_bytes - dataset_name = request.pretty_name - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) - - # We don't add to the profiler state if we only do table level profiling as it always happens - if self.state_handler: - self.state_handler.add_to_state( - dataset_urn, int(datetime.now().timestamp() * 1000) - ) - - yield MetadataChangeProposalWrapper( - entityUrn=dataset_urn, aspect=profile - ).as_workunit() + ) - def get_snowflake_profile_request( - self, - table: SnowflakeTable, - schema_name: str, - db_name: str, - ) -> Optional[SnowflakeProfilerRequest]: - skip_profiling = False - profile_table_level_only = self.config.profiling.profile_table_level_only - dataset_name = 
self.get_dataset_identifier(table.name, schema_name, db_name) - if not self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, table.size_in_bytes, table.rows_count + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + return self.get_dataset_identifier(table_name, schema_name, db_name) + + def get_batch_kwargs( + self, table: BaseTable, schema_name: str, db_name: str + ) -> dict: + custom_sql = None + if ( + not self.config.profiling.limit + and self.config.profiling.use_sampling + and table.rows_count + and table.rows_count > self.config.profiling.sample_size ): - # Profile only table level if dataset is filtered from profiling - # due to size limits alone - if self.is_dataset_eligible_for_profiling( - dataset_name, table.last_altered, 0, 0 - ): - profile_table_level_only = True - else: - skip_profiling = True - - if len(table.columns) == 0: - skip_profiling = True - - if skip_profiling: - if self.config.profiling.report_dropped_profiles: - self.report.report_dropped(f"profile of {dataset_name}") - return None - - self.report.report_entity_profiled(dataset_name) - logger.debug(f"Preparing profiling request for {dataset_name}") - profile_request = SnowflakeProfilerRequest( - pretty_name=dataset_name, - batch_kwargs=dict( - schema=schema_name, - table=table.name, - # Lowercase/Mixedcase table names in Snowflake do not work by default. - # We need to pass `use_quoted_name=True` for such tables as mentioned here - - # https://github.com/great-expectations/great_expectations/pull/2023 - use_quoted_name=(table.name != table.name.upper()), - ), - table=table, - profile_table_level_only=profile_table_level_only, - ) - return profile_request + # GX creates a temporary table from query if query is passed as batch kwargs. + # We are using fraction-based sampling here, instead of fixed-size sampling because + # Fixed-size sampling can be slower than equivalent fraction-based sampling + # as per https://docs.snowflake.com/en/sql-reference/constructs/sample#performance-considerations + sample_pc = 100 * self.config.profiling.sample_size / table.rows_count + custom_sql = f'select * from "{db_name}"."{schema_name}"."{table.name}" TABLESAMPLE ({sample_pc:.8f})' + return { + **super().get_batch_kwargs(table, schema_name, db_name), + # Lowercase/Mixedcase table names in Snowflake do not work by default. 
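+ # (Snowflake folds unquoted identifiers to upper case, so a mixed-case name such as "MyTable" resolves only when quoted.)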
+ # We need to pass `use_quoted_name=True` for such tables as mentioned here - + # https://github.com/great-expectations/great_expectations/pull/2023 + "use_quoted_name": (table.name != table.name.upper()), + "custom_sql": custom_sql, + } def get_profiler_instance( self, db_name: Optional[str] = None diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 240e0ffa1a0b6d..a5c07d9a3870c6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -4,6 +4,7 @@ import os.path import platform from dataclasses import dataclass +from functools import partial from typing import Callable, Dict, Iterable, List, Optional, Union import pandas as pd @@ -26,6 +27,7 @@ platform_name, support_status, ) +from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.source import ( CapabilityReport, MetadataWorkUnitProcessor, @@ -301,14 +303,11 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # Caches tables for a single database. Consider moving to disk or S3 when possible. self.db_tables: Dict[str, List[SnowflakeTable]] = {} - self.sql_parser_schema_resolver = SchemaResolver( - platform=self.platform, - platform_instance=self.config.platform_instance, - env=self.config.env, - ) self.view_definitions: FileBackedDict[str] = FileBackedDict() self.add_config_to_report() + self.sql_parser_schema_resolver = self._init_schema_resolver() + @classmethod def create(cls, config_dict: dict, ctx: PipelineContext) -> "Source": config = SnowflakeV2Config.parse_obj(config_dict) @@ -493,9 +492,32 @@ def query(query): return _report + def _init_schema_resolver(self) -> SchemaResolver: + if not self.config.include_technical_schema and self.config.parse_view_ddl: + if self.ctx.graph: + return self.ctx.graph.initialize_schema_resolver_from_datahub( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + else: + logger.warning( + "Failed to load schema info from DataHub as DataHubGraph is missing.", + ) + return SchemaResolver( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), + partial( + auto_incremental_lineage, + self.ctx.graph, + self.config.incremental_lineage, + ), StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, @@ -764,7 +786,7 @@ def _process_schema( ) self.db_tables[schema_name] = tables - if self.config.include_technical_schema or self.config.parse_view_ddl: + if self.config.include_technical_schema: for table in tables: yield from self._process_table(table, schema_name, db_name) @@ -776,7 +798,7 @@ def _process_schema( if view.view_definition: self.view_definitions[key] = view.view_definition - if self.config.include_technical_schema or self.config.parse_view_ddl: + if self.config.include_technical_schema: for view in views: yield from self._process_view(view, schema_name, db_name) @@ -892,8 +914,6 @@ def _process_table( yield from self._process_tag(tag) yield from self.gen_dataset_workunits(table, schema_name, db_name) - elif self.config.parse_view_ddl: - self.gen_schema_metadata(table, schema_name, db_name) def fetch_sample_data_for_classification( self, 
table: SnowflakeTable, schema_name: str, db_name: str, dataset_name: str @@ -1004,8 +1024,6 @@ def _process_view( yield from self._process_tag(tag) yield from self.gen_dataset_workunits(view, schema_name, db_name) - elif self.config.parse_view_ddl: - self.gen_schema_metadata(view, schema_name, db_name) def _process_tag(self, tag: SnowflakeTag) -> Iterable[MetadataWorkUnit]: tag_identifier = tag.identifier() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index 9cb613bde1e9f8..75e8fe1d6f7a6f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -1,12 +1,17 @@ import json import logging +import re import typing -from typing import Any, Dict, Iterable, List, Optional, Tuple, cast +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union, cast import pydantic from pyathena.common import BaseCursor from pyathena.model import AthenaTableMetadata +from pyathena.sqlalchemy_athena import AthenaRestDialect +from sqlalchemy import create_engine, inspect, types from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.types import TypeEngine +from sqlalchemy_bigquery import STRUCT from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.emitter.mcp_builder import ContainerKey, DatabaseKey @@ -21,13 +26,166 @@ from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.aws.s3_util import make_s3_urn from datahub.ingestion.source.common.subtypes import DatasetContainerSubTypes -from datahub.ingestion.source.sql.sql_common import SQLAlchemySource +from datahub.ingestion.source.sql.sql_common import ( + SQLAlchemySource, + register_custom_type, +) from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri from datahub.ingestion.source.sql.sql_utils import ( add_table_to_schema_container, gen_database_container, gen_database_key, ) +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField +from datahub.metadata.schema_classes import MapTypeClass, RecordTypeClass +from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column +from datahub.utilities.sqlalchemy_type_converter import ( + MapType, + get_schema_fields_for_sqlalchemy_column, +) + +logger = logging.getLogger(__name__) + +assert STRUCT, "required type modules are not available" +register_custom_type(STRUCT, RecordTypeClass) +register_custom_type(MapType, MapTypeClass) + + +class CustomAthenaRestDialect(AthenaRestDialect): + """Custom definition of the Athena dialect. + + Custom implementation that allows extending/modifying the behavior of the SQLalchemy + dialect that is used by PyAthena (which is the library that is used by DataHub + to extract metadata from Athena). + This dialect can then be used by the inspector (see get_inspectors()). + + """ + + # regex to identify complex types in DDL strings which are embedded in `<>`. + _complex_type_pattern = re.compile(r"(<.+>)") + + @typing.no_type_check + def _get_column_type( + self, type_: Union[str, Dict[str, Any]] + ) -> TypeEngine: # noqa: C901 + """Derives the data type of the Athena column. + + This method is overridden to extend the behavior of PyAthena. + PyAthena is not capable of detecting complex data types, e.g., + arrays, maps, or structs (as of version 2.25.2). + The custom implementation adds support for the above-mentioned data types. 
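+ For example, with this implementation an Athena DDL type such as array<string> resolves to a types.ARRAY wrapping the parsed element type, map<string,string> resolves to the custom MapType, and a struct<...> definition resolves to the SQLalchemy-bigquery STRUCT type.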
+ """ + + # Originally, this method only handles `type_` as a string + # With the workaround used below to parse DDL strings for structs, + # `type` might also be a dictionary + if isinstance(type_, str): + match = self._pattern_column_type.match(type_) + if match: + type_name = match.group(1).lower() + type_meta_information = match.group(2) + else: + type_name = type_.lower() + type_meta_information = None + elif isinstance(type_, dict): + # this occurs only when a type parsed as part of a STRUCT is passed + # in such case type_ is a dictionary whose type can be retrieved from the attribute + type_name = type_.get("type", None) + type_meta_information = None + else: + raise RuntimeError(f"Unsupported type definition: {type_}") + + args = [] + + if type_name in ["array"]: + detected_col_type = types.ARRAY + + # here we need to account again for two options for how `type_` is passed to this method + # first, the simple array definition as a DDL string (something like array) + # this is always the case when the array is not part of a complex data type (mainly STRUCT) + # second, the array definition can also be passed in the form of a dictionary + # this is the case when the array is part of a complex data type + if isinstance(type_, str): + # retrieve the raw name of the data type as a string + array_type_raw = self._complex_type_pattern.findall(type_)[0][ + 1:-1 + ] # array type without enclosing <> + # convert the string name of the data type into a SQLalchemy type (expected return) + array_type = self._get_column_type(array_type_raw) + elif isinstance(type_, dict): + # retrieve the data type of the array items and + # transform it into a SQLalchemy type + array_type = self._get_column_type(type_["items"]) + else: + raise RuntimeError(f"Unsupported array definition: {type_}") + + args = [array_type] + + elif type_name in ["struct", "record"]: + # STRUCT is not part of the SQLalchemy types selection + # but is provided by another official SQLalchemy library and + # compatible with the other SQLalchemy types + detected_col_type = STRUCT + + if isinstance(type_, dict): + # in case a struct as part of another struct is passed + # it is provided in the form of a dictionary and + # can simply be used for the further processing + struct_type = type_ + else: + # this is the case when the type definition of the struct is passed as a DDL string + # therefore, it is required to parse the DDL string + # here a method provided in another DataHub source is used so that the parsing + # doesn't need to be implemented twice + # `get_avro_schema_for_hive_column` accepts a DDL description as column type and + # returns the parsed data types in the form of a dictionary + schema = get_avro_schema_for_hive_column( + hive_column_name=type_name, hive_column_type=type_ + ) + + # the actual type description needs to be extracted + struct_type = schema["fields"][0]["type"] + + # A STRUCT consists of multiple attributes which are expected to be passed as + # a list of tuples consisting of name and data type pairs. 
e.g., `('age', Integer())` + # See the reference: + # https://github.com/googleapis/python-bigquery-sqlalchemy/blob/main/sqlalchemy_bigquery/_struct.py#L53 + # + # To extract all of them, we simply iterate over all detected fields and + # convert them to SQLalchemy types + struct_args = [] + for field in struct_type["fields"]: + struct_args.append( + ( + field["name"], + self._get_column_type(field["type"]["type"]) + if field["type"]["type"] not in ["record", "array"] + else self._get_column_type(field["type"]), + ) + ) + + args = struct_args + + elif type_name in ["map"]: + # Instead of SQLalchemy's TupleType the custom MapType is used here + # which is just a simple wrapper around TupleType + detected_col_type = MapType + + # the type definition for maps looks like the following: key_type:val_type (e.g., string:string) + key_type_raw, value_type_raw = type_meta_information.split(",") + + # convert both type names to actual SQLalchemy types + args = [ + self._get_column_type(key_type_raw), + self._get_column_type(value_type_raw), + ] + # by using get_avro_schema_for_hive_column() for parsing STRUCTs the data type `long` + # can also be returned, so we need to extend the handling here as well + elif type_name in ["bigint", "long"]: + detected_col_type = types.BIGINT + else: + return super()._get_column_type(type_name) + return detected_col_type(*args) class AthenaConfig(SQLCommonConfig): @@ -129,6 +287,18 @@ def create(cls, config_dict, ctx): config = AthenaConfig.parse_obj(config_dict) return cls(config, ctx) + # overwrite this method to allow to specify the usage of a custom dialect + def get_inspectors(self) -> Iterable[Inspector]: + url = self.config.get_sql_alchemy_url() + logger.debug(f"sql_alchemy_url={url}") + engine = create_engine(url, **self.config.options) + + # set custom dialect to be used by the inspector + engine.dialect = CustomAthenaRestDialect() + with engine.connect() as conn: + inspector = inspect(conn) + yield inspector + def get_table_properties( self, inspector: Inspector, schema: str, table: str ) -> Tuple[Optional[str], Dict[str, str], Optional[str]]: @@ -136,9 +306,7 @@ def get_table_properties( self.cursor = cast(BaseCursor, inspector.engine.raw_connection().cursor()) assert self.cursor - # Unfortunately properties can be only get through private methods as those are not exposed - # https://github.com/laughingman7743/PyAthena/blob/9e42752b0cc7145a87c3a743bb2634fe125adfa7/pyathena/model.py#L201 - metadata: AthenaTableMetadata = self.cursor._get_table_metadata( + metadata: AthenaTableMetadata = self.cursor.get_table_metadata( table_name=table, schema_name=schema ) description = metadata.comment @@ -241,6 +409,30 @@ def get_schema_names(self, inspector: Inspector) -> List[str]: return [schema for schema in schemas if schema == athena_config.database] return schemas + # Overwrite to modify the creation of schema fields + def get_schema_fields_for_column( + self, + dataset_name: str, + column: Dict, + pk_constraints: Optional[dict] = None, + tags: Optional[List[str]] = None, + ) -> List[SchemaField]: + fields = get_schema_fields_for_sqlalchemy_column( + column_name=column["name"], + column_type=column["type"], + description=column.get("comment", None), + nullable=column.get("nullable", True), + is_part_of_key=True + if ( + pk_constraints is not None + and isinstance(pk_constraints, dict) + and column["name"] in pk_constraints.get("constrained_columns", []) + ) + else False, + ) + + return fields + def close(self): if self.cursor: self.cursor.close() diff --git 
a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py index 1626f86b92545c..8873038079bada 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/clickhouse.py @@ -19,9 +19,9 @@ from sqlalchemy.types import BOOLEAN, DATE, DATETIME, INTEGER import datahub.emitter.mce_builder as builder -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import DatasetLineageProviderConfigBase from datahub.configuration.time_window_config import BaseTimeWindowConfig +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.emitter import mce_builder from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.decorators import ( diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py index 63b21bc82edddd..d081acb6c1effa 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/hive.py @@ -1,15 +1,18 @@ import json import logging import re -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Union from pydantic.class_validators import validator from pydantic.fields import Field # This import verifies that the dependencies are available. from pyhive import hive # noqa: F401 -from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveTimestamp +from pyhive.sqlalchemy_hive import HiveDate, HiveDecimal, HiveDialect, HiveTimestamp +from sqlalchemy.engine.reflection import Inspector +from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.decorators import ( SourceCapability, SupportStatus, @@ -18,8 +21,10 @@ platform_name, support_status, ) +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.extractor import schema_util -from datahub.ingestion.source.sql.sql_common import register_custom_type +from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig from datahub.ingestion.source.sql.two_tier_sql_source import ( TwoTierSQLAlchemyConfig, TwoTierSQLAlchemySource, @@ -31,6 +36,7 @@ SchemaField, TimeTypeClass, ) +from datahub.metadata.schema_classes import ViewPropertiesClass from datahub.utilities import config_clean from datahub.utilities.hive_schema_to_avro import get_avro_schema_for_hive_column @@ -90,19 +96,34 @@ def dbapi_get_columns_patched(self, connection, table_name, schema=None, **kw): logger.warning(f"Failed to patch method due to {e}") +@reflection.cache # type: ignore +def get_view_names_patched(self, connection, schema=None, **kw): + query = "SHOW VIEWS" + if schema: + query += " IN " + self.identifier_preparer.quote_identifier(schema) + return [row[0] for row in connection.execute(query)] + + +@reflection.cache # type: ignore +def get_view_definition_patched(self, connection, view_name, schema=None, **kw): + full_table = self.identifier_preparer.quote_identifier(view_name) + if schema: + full_table = "{}.{}".format( + self.identifier_preparer.quote_identifier(schema), + self.identifier_preparer.quote_identifier(view_name), + ) + row = 
connection.execute("SHOW CREATE TABLE {}".format(full_table)).fetchone() + return row[0] + + +HiveDialect.get_view_names = get_view_names_patched +HiveDialect.get_view_definition = get_view_definition_patched + + class HiveConfig(TwoTierSQLAlchemyConfig): # defaults scheme = Field(default="hive", hidden_from_docs=True) - # Hive SQLAlchemy connector returns views as tables. - # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. - # Disabling views helps us prevent this duplication. - include_views = Field( - default=False, - hidden_from_docs=True, - description="Hive SQLAlchemy connector returns views as tables. See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. Disabling views helps us prevent this duplication.", - ) - @validator("host_port") def clean_host_port(cls, v): return config_clean.remove_protocol(v) @@ -174,3 +195,41 @@ def get_schema_fields_for_column( return new_fields return fields + + # Hive SQLAlchemy connector returns views as tables in get_table_names. + # See https://github.com/dropbox/PyHive/blob/b21c507a24ed2f2b0cf15b0b6abb1c43f31d3ee0/pyhive/sqlalchemy_hive.py#L270-L273. + # This override makes sure that we ingest view definitions for views + def _process_view( + self, + dataset_name: str, + inspector: Inspector, + schema: str, + view: str, + sql_config: SQLCommonConfig, + ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: + dataset_urn = make_dataset_urn_with_platform_instance( + self.platform, + dataset_name, + self.config.platform_instance, + self.config.env, + ) + + try: + view_definition = inspector.get_view_definition(view, schema) + if view_definition is None: + view_definition = "" + else: + # Some dialects return a TextClause instead of a raw string, + # so we need to convert them to a string. 
+ view_definition = str(view_definition) + except NotImplementedError: + view_definition = "" + + if view_definition: + view_properties_aspect = ViewPropertiesClass( + materialized=False, viewLanguage="SQL", viewLogic=view_definition + ) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=view_properties_aspect, + ).as_workunit() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py index 685d4fb3074c92..710825c8ba55da 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/mssql/source.py @@ -530,7 +530,7 @@ def _get_procedure_inputs( def _get_procedure_code( conn: Connection, procedure: StoredProcedure ) -> Tuple[Optional[str], Optional[str]]: - query = f"EXEC [{procedure.db}].dbo.sp_helptext '{procedure.full_name}'" + query = f"EXEC [{procedure.db}].dbo.sp_helptext '{procedure.escape_full_name}'" try: code_data = conn.execute(query) except ProgrammingError: @@ -567,7 +567,7 @@ def _get_procedure_properties( create_date as date_created, modify_date as date_modified FROM sys.procedures - WHERE object_id = object_id('{procedure.full_name}') + WHERE object_id = object_id('{procedure.escape_full_name}') """ ) properties = {} diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py index ba8655b83446d6..4f133c6459a0ff 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/postgres.py @@ -103,10 +103,6 @@ class BasePostgresConfig(BasicSQLAlchemyConfig): class PostgresConfig(BasePostgresConfig): - include_view_lineage = Field( - default=False, description="Include table lineage for views" - ) - database_pattern: AllowDenyPattern = Field( default=AllowDenyPattern.allow_all(), description=( @@ -183,9 +179,10 @@ def get_inspectors(self) -> Iterable[Inspector]: def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: yield from super().get_workunits_internal() - for inspector in self.get_inspectors(): - if self.config.include_view_lineage: - yield from self._get_view_lineage_workunits(inspector) + if self.views_failed_parsing: + for inspector in self.get_inspectors(): + if self.config.include_view_lineage: + yield from self._get_view_lineage_workunits(inspector) def _get_view_lineage_elements( self, inspector: Inspector @@ -217,14 +214,15 @@ def _get_view_lineage_elements( key = (lineage.dependent_view, lineage.dependent_schema) # Append the source table to the list. lineage_elements[key].append( - mce_builder.make_dataset_urn( - self.platform, - self.get_identifier( + mce_builder.make_dataset_urn_with_platform_instance( + platform=self.platform, + name=self.get_identifier( schema=lineage.source_schema, entity=lineage.source_table, inspector=inspector, ), - self.config.env, + platform_instance=self.config.platform_instance, + env=self.config.env, ) ) @@ -244,12 +242,16 @@ def _get_view_lineage_workunits( dependent_view, dependent_schema = key # Construct a lineage object. 
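+ # With view definitions now parsed by sql_common's sqlglot-based lineage, + # this query-based path only serves as a fallback for views whose + # definitions failed to parse (tracked in self.views_failed_parsing).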
- urn = mce_builder.make_dataset_urn( - self.platform, - self.get_identifier( - schema=dependent_schema, entity=dependent_view, inspector=inspector - ), - self.config.env, + view_identifier = self.get_identifier( + schema=dependent_schema, entity=dependent_view, inspector=inspector + ) + if view_identifier not in self.views_failed_parsing: + return + urn = mce_builder.make_dataset_urn_with_platform_instance( + platform=self.platform, + name=view_identifier, + platform_instance=self.config.platform_instance, + env=self.config.env, ) # use the mce_builder to ensure that the change proposal inherits diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py index 112defe76d9571..80f828e9ea2fd1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_common.py @@ -1,15 +1,15 @@ import datetime import logging import traceback -from collections import OrderedDict from dataclasses import dataclass, field +from functools import partial from typing import ( TYPE_CHECKING, Any, - Callable, Dict, Iterable, List, + MutableMapping, Optional, Set, Tuple, @@ -20,6 +20,7 @@ import sqlalchemy.dialects.postgresql.base from sqlalchemy import create_engine, inspect from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.engine.row import LegacyRow from sqlalchemy.exc import ProgrammingError from sqlalchemy.sql import sqltypes as types from sqlalchemy.types import TypeDecorator, TypeEngine @@ -31,7 +32,9 @@ make_tag_urn, ) from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.emitter.sql_parsing_builder import SqlParsingBuilder from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage from datahub.ingestion.api.source import MetadataWorkUnitProcessor from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.common.subtypes import ( @@ -88,9 +91,16 @@ ViewPropertiesClass, ) from datahub.telemetry import telemetry +from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.lossy_collections import LossyList from datahub.utilities.registries.domain_registry import DomainRegistry from datahub.utilities.sqlalchemy_query_combiner import SQLAlchemyQueryCombinerReport +from datahub.utilities.sqlglot_lineage import ( + SchemaResolver, + SqlParsingResult, + sqlglot_lineage, + view_definition_lineage_helper, +) if TYPE_CHECKING: from datahub.ingestion.source.ge_data_profiler import ( @@ -103,52 +113,6 @@ MISSING_COLUMN_INFO = "missing column information" -def _platform_alchemy_uri_tester_gen( - platform: str, opt_starts_with: Optional[str] = None -) -> Tuple[str, Callable[[str], bool]]: - return platform, lambda x: x.startswith( - platform if not opt_starts_with else opt_starts_with - ) - - -PLATFORM_TO_SQLALCHEMY_URI_TESTER_MAP: Dict[str, Callable[[str], bool]] = OrderedDict( - [ - _platform_alchemy_uri_tester_gen("athena", "awsathena"), - _platform_alchemy_uri_tester_gen("bigquery"), - _platform_alchemy_uri_tester_gen("clickhouse"), - _platform_alchemy_uri_tester_gen("druid"), - _platform_alchemy_uri_tester_gen("hana"), - _platform_alchemy_uri_tester_gen("hive"), - _platform_alchemy_uri_tester_gen("mongodb"), - _platform_alchemy_uri_tester_gen("mssql"), - _platform_alchemy_uri_tester_gen("mysql"), - _platform_alchemy_uri_tester_gen("oracle"), - 
_platform_alchemy_uri_tester_gen("pinot"), - _platform_alchemy_uri_tester_gen("presto"), - ( - "redshift", - lambda x: ( - x.startswith(("jdbc:postgres:", "postgresql")) - and x.find("redshift.amazonaws") > 0 - ) - or x.startswith("redshift"), - ), - # Don't move this before redshift. - _platform_alchemy_uri_tester_gen("postgres", "postgresql"), - _platform_alchemy_uri_tester_gen("snowflake"), - _platform_alchemy_uri_tester_gen("trino"), - _platform_alchemy_uri_tester_gen("vertica"), - ] -) - - -def get_platform_from_sqlalchemy_uri(sqlalchemy_uri: str) -> str: - for platform, tester in PLATFORM_TO_SQLALCHEMY_URI_TESTER_MAP.items(): - if tester(sqlalchemy_uri): - return platform - return "external" - - @dataclass class SQLSourceReport(StaleEntityRemovalSourceReport): tables_scanned: int = 0 @@ -158,6 +122,11 @@ class SQLSourceReport(StaleEntityRemovalSourceReport): query_combiner: Optional[SQLAlchemyQueryCombinerReport] = None + num_view_definitions_parsed: int = 0 + num_view_definitions_failed_parsing: int = 0 + num_view_definitions_failed_column_parsing: int = 0 + view_definitions_parsing_failures: LossyList[str] = field(default_factory=LossyList) + def report_entity_scanned(self, name: str, ent_type: str = "table") -> None: """ Entity could be a view or a table @@ -186,6 +155,7 @@ class SqlWorkUnit(MetadataWorkUnit): _field_type_mapping: Dict[Type[TypeEngine], Type] = { + # Note: to add dialect-specific types to this mapping, use the `register_custom_type` function. types.Integer: NumberTypeClass, types.Numeric: NumberTypeClass, types.Boolean: BooleanTypeClass, @@ -366,6 +336,18 @@ def __init__(self, config: SQLCommonConfig, ctx: PipelineContext, platform: str) cached_domains=[k for k in self.config.domain], graph=self.ctx.graph ) + self.views_failed_parsing: Set[str] = set() + self.schema_resolver: SchemaResolver = SchemaResolver( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + ) + self._view_definition_cache: MutableMapping[str, str] + if self.config.use_file_backed_cache: + self._view_definition_cache = FileBackedDict[str]() + else: + self._view_definition_cache = {} + def warn(self, log: logging.Logger, key: str, reason: str) -> None: self.report.report_warning(key, reason[:100]) log.warning(f"{key} => {reason}") @@ -502,6 +484,11 @@ def get_schema_level_workunits( def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: return [ *super().get_workunit_processors(), + partial( + auto_incremental_lineage, + self.ctx.graph, + self.config.incremental_lineage, + ), StaleEntityRemovalHandler.create( self, self.config, self.ctx ).workunit_processor, @@ -559,6 +546,35 @@ def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit profile_requests, profiler, platform=self.platform ) + if self.config.include_view_lineage: + yield from self.get_view_lineage() + + def get_view_lineage(self) -> Iterable[MetadataWorkUnit]: + builder = SqlParsingBuilder( + generate_lineage=True, + generate_usage_statistics=False, + generate_operations=False, + ) + for dataset_name in self._view_definition_cache.keys(): + view_definition = self._view_definition_cache[dataset_name] + result = self._run_sql_parser( + dataset_name, + view_definition, + self.schema_resolver, + ) + if result and result.out_tables: + # This does not yield any workunits but we use + # yield here to execute this method + yield from builder.process_sql_parsing_result( + result=result, + query=view_definition, + is_view_ddl=True, + 
include_column_lineage=self.config.include_view_column_lineage, + ) + else: + self.views_failed_parsing.add(dataset_name) + yield from builder.gen_workunits() + def get_identifier( self, *, schema: str, entity: str, inspector: Inspector, **kwargs: Any ) -> str: @@ -705,6 +721,8 @@ def _process_table( schema_fields, ) dataset_snapshot.aspects.append(schema_metadata) + if self.config.include_view_lineage: + self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata) db_name = self.get_db_name(inspector) yield from self.add_table_to_schema_container( @@ -767,7 +785,7 @@ def get_table_properties( table_info: dict = inspector.get_table_comment(table, f'"{schema}"') # type: ignore description = table_info.get("text") - if type(description) is tuple: + if isinstance(description, LegacyRow): # Handle the tuple-like LegacyRow value returned by the 'db2+ibm_db' dialect description = table_info["text"][0] @@ -909,6 +927,12 @@ def _process_view( view: str, sql_config: SQLCommonConfig, ) -> Iterable[Union[SqlWorkUnit, MetadataWorkUnit]]: + dataset_urn = make_dataset_urn_with_platform_instance( + self.platform, + dataset_name, + self.config.platform_instance, + self.config.env, + ) try: columns = inspector.get_columns(view, schema) except KeyError: @@ -924,6 +948,8 @@ def _process_view( columns, canonical_schema=schema_fields, ) + if self.config.include_view_lineage: + self.schema_resolver.add_schema_metadata(dataset_urn, schema_metadata) description, properties, _ = self.get_table_properties(inspector, schema, view) try: view_definition = inspector.get_view_definition(view, schema) @@ -937,12 +963,9 @@ def _process_view( view_definition = "" properties["view_definition"] = view_definition properties["is_view"] = "True" - dataset_urn = make_dataset_urn_with_platform_instance( - self.platform, - dataset_name, - self.config.platform_instance, - self.config.env, - ) + if view_definition and self.config.include_view_lineage: + self._view_definition_cache[dataset_name] = view_definition + dataset_snapshot = DatasetSnapshot( urn=dataset_urn, aspects=[StatusClass(removed=False)], @@ -989,6 +1012,51 @@ def _process_view( domain_registry=self.domain_registry, ) + def _run_sql_parser( + self, view_identifier: str, query: str, schema_resolver: SchemaResolver + ) -> Optional[SqlParsingResult]: + try: + database, schema = self.get_db_schema(view_identifier) + except ValueError: + logger.warning(f"Invalid view identifier: {view_identifier}") + return None + raw_lineage = sqlglot_lineage( + query, + schema_resolver=schema_resolver, + default_db=database, + default_schema=schema, + ) + view_urn = make_dataset_urn_with_platform_instance( + self.platform, + view_identifier, + self.config.platform_instance, + self.config.env, + ) + + if raw_lineage.debug_info.table_error: + logger.debug( + f"Failed to parse lineage for view {view_identifier}: " + f"{raw_lineage.debug_info.table_error}" + ) + self.report.num_view_definitions_failed_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Table-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.table_error}" + ) + return None + + elif raw_lineage.debug_info.column_error: + self.report.num_view_definitions_failed_column_parsing += 1 + self.report.view_definitions_parsing_failures.append( + f"Column-level sql parsing error for view {view_identifier}: {raw_lineage.debug_info.column_error}" + ) + else: + self.report.num_view_definitions_parsed += 1 + return view_definition_lineage_helper(raw_lineage, view_urn) + + def 
get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]: + database, schema, _view = dataset_identifier.split(".") + return database, schema + def get_profiler_instance(self, inspector: Inspector) -> "DatahubGEProfiler": from datahub.ingestion.source.ge_data_profiler import DatahubGEProfiler diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py index 8f1e04b915f3b2..095b8e64431719 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_config.py @@ -1,14 +1,17 @@ import logging from abc import abstractmethod from typing import Any, Dict, Optional -from urllib.parse import quote_plus import pydantic from pydantic import Field +from sqlalchemy.engine import URL -from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.common import AllowDenyPattern, ConfigModel, LineageConfig +from datahub.configuration.source_common import ( + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, +) +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig from datahub.ingestion.source.state.stale_entity_removal_handler import ( StatefulStaleMetadataRemovalConfig, @@ -21,7 +24,12 @@ logger: logging.Logger = logging.getLogger(__name__) -class SQLCommonConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): +class SQLCommonConfig( + StatefulIngestionConfigBase, + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, + LineageConfig, +): options: dict = pydantic.Field( default_factory=dict, description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", @@ -63,6 +71,22 @@ class SQLCommonConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin): description="If the source supports it, include table lineage to the underlying storage location.", ) + include_view_lineage: bool = Field( + default=True, + description="Populates view->view and table->view lineage using DataHub's sql parser.", + ) + + include_view_column_lineage: bool = Field( + default=True, + description="Populates column-level lineage for view->view and table->view lineage using DataHub's sql parser." + " Requires `include_view_lineage` to be enabled.", + ) + + use_file_backed_cache: bool = Field( + default=True, + description="Whether to use a file backed cache for the view definitions.", + ) + profiling: GEProfilingConfig = GEProfilingConfig() # Custom Stateful Ingestion settings stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = None @@ -118,7 +142,11 @@ class SQLAlchemyConnectionConfig(ConfigModel): # Duplicate of SQLCommonConfig.options options: dict = pydantic.Field( default_factory=dict, - description="Any options specified here will be passed to [SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs.", + description=( + "Any options specified here will be passed to " + "[SQLAlchemy.create_engine](https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine) as kwargs." 
+ " To set connection arguments in the URL, specify them under `connect_args`." + ), ) _database_alias_deprecation = pydantic_field_deprecated( @@ -154,21 +182,26 @@ def make_sqlalchemy_uri( db: Optional[str], uri_opts: Optional[Dict[str, Any]] = None, ) -> str: - url = f"{scheme}://" - if username is not None: - url += f"{quote_plus(username)}" - if password is not None: - url += f":{quote_plus(password)}" - url += "@" - if at is not None: - url += f"{at}" - if db is not None: - url += f"/{db}" - if uri_opts is not None: - if db is None: - url += "/" - params = "&".join( - f"{key}={quote_plus(value)}" for (key, value) in uri_opts.items() if value + host: Optional[str] = None + port: Optional[int] = None + if at: + try: + host, port_str = at.rsplit(":", 1) + port = int(port_str) + except ValueError: + host = at + port = None + if uri_opts: + uri_opts = {k: v for k, v in uri_opts.items() if v is not None} + + return str( + URL.create( + drivername=scheme, + username=username, + password=password, + host=host, + port=port, + database=db, + query=uri_opts or {}, ) - url = f"{url}?{params}" - return url + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py index 344c114d464a92..aaeee5717a867c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sql_generic_profiler.py @@ -1,12 +1,15 @@ import logging +from abc import abstractmethod from dataclasses import dataclass, field from datetime import datetime, timedelta, timezone -from typing import Dict, Iterable, List, Optional, Tuple, Union, cast +from typing import Dict, Iterable, List, Optional, Union, cast from sqlalchemy import create_engine, inspect from sqlalchemy.engine.reflection import Inspector from datahub.emitter.mce_builder import make_dataset_urn_with_platform_instance +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.ge_data_profiler import ( DatahubGEProfiler, GEProfilerRequest, @@ -16,7 +19,7 @@ from datahub.ingestion.source.sql.sql_generic import BaseTable, BaseView from datahub.ingestion.source.state.profiling_state_handler import ProfilingHandler from datahub.metadata.com.linkedin.pegasus2avro.dataset import DatasetProfile -from datahub.metadata.schema_classes import DatasetProfileClass +from datahub.metadata.com.linkedin.pegasus2avro.timeseries import PartitionType from datahub.utilities.stats_collections import TopKDict, int_top_k_dict @@ -63,14 +66,14 @@ def __init__( self.platform = platform self.state_handler = state_handler - def generate_profiles( + def generate_profile_workunits( self, requests: List[TableProfilerRequest], max_workers: int, db_name: Optional[str] = None, platform: Optional[str] = None, profiler_args: Optional[Dict] = None, - ) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]: + ) -> Iterable[MetadataWorkUnit]: ge_profile_requests: List[GEProfilerRequest] = [ cast(GEProfilerRequest, request) for request in requests @@ -80,21 +83,109 @@ def generate_profiles( request for request in requests if request.profile_table_level_only ] for request in table_level_profile_requests: - profile = DatasetProfile( + table_level_profile = DatasetProfile( timestampMillis=int(datetime.now().timestamp() * 1000), columnCount=request.table.column_count, rowCount=request.table.rows_count, 
sizeInBytes=request.table.size_in_bytes, ) - yield (request, profile) + dataset_urn = self.dataset_urn_builder(request.pretty_name) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=table_level_profile + ).as_workunit() if not ge_profile_requests: return # Otherwise, if column level profiling is enabled, use GE profiler. ge_profiler = self.get_profiler_instance(db_name) - yield from ge_profiler.generate_profiles( + + for ge_profiler_request, profile in ge_profiler.generate_profiles( ge_profile_requests, max_workers, platform, profiler_args + ): + if profile is None: + continue + + request = cast(TableProfilerRequest, ge_profiler_request) + profile.sizeInBytes = request.table.size_in_bytes + + # If the table is partitioned, we profile only one partition (by default + # the last one), but at the table level we can use rows_count from the + # table metadata. This way, even though the column statistics reflect only + # one partition's data, the row count shows the proper value. + if ( + profile.partitionSpec + and profile.partitionSpec.type != PartitionType.FULL_TABLE + ): + profile.rowCount = request.table.rows_count + + dataset_urn = self.dataset_urn_builder(request.pretty_name) + + # We don't add to the profiler state when we only do table-level profiling, since that always happens + if self.state_handler: + self.state_handler.add_to_state( + dataset_urn, int(datetime.now().timestamp() * 1000) + ) + yield MetadataChangeProposalWrapper( + entityUrn=dataset_urn, aspect=profile + ).as_workunit() + + def dataset_urn_builder(self, dataset_name: str) -> str: + return make_dataset_urn_with_platform_instance( + self.platform, + dataset_name, + self.config.platform_instance, + self.config.env, + ) + + @abstractmethod + def get_dataset_name(self, table_name: str, schema_name: str, db_name: str) -> str: + pass + + def get_profile_request( + self, table: BaseTable, schema_name: str, db_name: str + ) -> Optional[TableProfilerRequest]: + skip_profiling = False + profile_table_level_only = self.config.profiling.profile_table_level_only + dataset_name = self.get_dataset_name(table.name, schema_name, db_name) + if not self.is_dataset_eligible_for_profiling( + dataset_name, table.last_altered, table.size_in_bytes, table.rows_count + ): + # Profile only table level if dataset is filtered from profiling + # due to size limits alone + if self.is_dataset_eligible_for_profiling( + dataset_name, table.last_altered, 0, 0 + ): + profile_table_level_only = True + else: + skip_profiling = True + self.report.num_tables_not_eligible_profiling[ + f"{db_name}.{schema_name}" + ] += 1 + + if table.column_count == 0: + skip_profiling = True + + if skip_profiling: + if self.config.profiling.report_dropped_profiles: + self.report.report_dropped(f"profile of {dataset_name}") + return None + + logger.debug(f"Preparing profiling request for {dataset_name}") + profile_request = TableProfilerRequest( + pretty_name=dataset_name, + batch_kwargs=self.get_batch_kwargs(table, schema_name, db_name), + table=table, + profile_table_level_only=profile_table_level_only, + ) + return profile_request + + def get_batch_kwargs( + self, table: BaseTable, schema_name: str, db_name: str + ) -> dict: + return dict( + schema=schema_name, + table=table.name, ) def get_inspectors(self) -> Iterable[Inspector]: diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py b/metadata-ingestion/src/datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py new file mode 100644 index 00000000000000..b6a463837228db --- 
/dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/sqlalchemy_uri_mapper.py @@ -0,0 +1,47 @@ +from collections import OrderedDict +from typing import Callable, Dict, Optional, Tuple + + +def _platform_alchemy_uri_tester_gen( + platform: str, opt_starts_with: Optional[str] = None +) -> Tuple[str, Callable[[str], bool]]: + return platform, lambda x: x.startswith(opt_starts_with or platform) + + +PLATFORM_TO_SQLALCHEMY_URI_TESTER_MAP: Dict[str, Callable[[str], bool]] = OrderedDict( + [ + _platform_alchemy_uri_tester_gen("athena", "awsathena"), + _platform_alchemy_uri_tester_gen("bigquery"), + _platform_alchemy_uri_tester_gen("clickhouse"), + _platform_alchemy_uri_tester_gen("druid"), + _platform_alchemy_uri_tester_gen("hana"), + _platform_alchemy_uri_tester_gen("hive"), + _platform_alchemy_uri_tester_gen("mongodb"), + _platform_alchemy_uri_tester_gen("mssql"), + _platform_alchemy_uri_tester_gen("mysql"), + _platform_alchemy_uri_tester_gen("oracle"), + _platform_alchemy_uri_tester_gen("pinot"), + _platform_alchemy_uri_tester_gen("presto"), + ( + "redshift", + lambda x: ( + x.startswith(("jdbc:postgres:", "postgresql")) + and x.find("redshift.amazonaws") > 0 + ) + or x.startswith("redshift"), + ), + # Don't move this before redshift. + _platform_alchemy_uri_tester_gen("postgres", "postgresql"), + _platform_alchemy_uri_tester_gen("snowflake"), + _platform_alchemy_uri_tester_gen("sqlite"), + _platform_alchemy_uri_tester_gen("trino"), + _platform_alchemy_uri_tester_gen("vertica"), + ] +) + + +def get_platform_from_sqlalchemy_uri(sqlalchemy_uri: str) -> str: + for platform, tester in PLATFORM_TO_SQLALCHEMY_URI_TESTER_MAP.items(): + if tester(sqlalchemy_uri): + return platform + return "external" diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py new file mode 100644 index 00000000000000..899a7b6697c0a5 --- /dev/null +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/teradata.py @@ -0,0 +1,223 @@ +import logging +from dataclasses import dataclass +from datetime import datetime +from typing import Iterable, Optional, Union + +# This import verifies that the dependencies are available. 
+import teradatasqlalchemy # noqa: F401 +import teradatasqlalchemy.types as custom_types +from pydantic.fields import Field +from sqlalchemy import create_engine +from sqlalchemy.engine import Engine + +from datahub.configuration.common import AllowDenyPattern +from datahub.configuration.time_window_config import BaseTimeWindowConfig +from datahub.emitter.sql_parsing_builder import SqlParsingBuilder +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.graph.client import DataHubGraph +from datahub.ingestion.source.sql.sql_common import SqlWorkUnit, register_custom_type +from datahub.ingestion.source.sql.sql_generic_profiler import ProfilingSqlReport +from datahub.ingestion.source.sql.two_tier_sql_source import ( + TwoTierSQLAlchemyConfig, + TwoTierSQLAlchemySource, +) +from datahub.ingestion.source.usage.usage_common import BaseUsageConfig +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport +from datahub.ingestion.source_report.time_window import BaseTimeWindowReport +from datahub.metadata.com.linkedin.pegasus2avro.schema import ( + BytesTypeClass, + TimeTypeClass, +) +from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage + +logger: logging.Logger = logging.getLogger(__name__) + +register_custom_type(custom_types.JSON, BytesTypeClass) +register_custom_type(custom_types.INTERVAL_DAY, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_DAY_TO_SECOND, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_DAY_TO_MINUTE, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_DAY_TO_HOUR, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_SECOND, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_MINUTE, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_MINUTE_TO_SECOND, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_HOUR, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_HOUR_TO_MINUTE, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_HOUR_TO_SECOND, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_MONTH, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_YEAR, TimeTypeClass) +register_custom_type(custom_types.INTERVAL_YEAR_TO_MONTH, TimeTypeClass) +register_custom_type(custom_types.MBB, BytesTypeClass) +register_custom_type(custom_types.MBR, BytesTypeClass) +register_custom_type(custom_types.GEOMETRY, BytesTypeClass) +register_custom_type(custom_types.TDUDT, BytesTypeClass) +register_custom_type(custom_types.XML, BytesTypeClass) + + +@dataclass +class TeradataReport(ProfilingSqlReport, IngestionStageReport, BaseTimeWindowReport): + num_queries_parsed: int = 0 + num_view_ddl_parsed: int = 0 + num_table_parse_failures: int = 0 + + +class BaseTeradataConfig(TwoTierSQLAlchemyConfig): + scheme = Field(default="teradatasql", description="database scheme") + + +class TeradataConfig(BaseTeradataConfig, BaseTimeWindowConfig): + database_pattern = Field( + default=AllowDenyPattern(deny=["dbc"]), + description="Regex patterns for databases to filter in ingestion.", + ) + include_table_lineage = Field( + default=False, + description="Whether to include table lineage in the ingestion. 
" + "This requires to have the table lineage feature enabled.", + ) + + usage: BaseUsageConfig = Field( + description="The usage config to use when generating usage statistics", + default=BaseUsageConfig(), + ) + + default_db: Optional[str] = Field( + default=None, + description="The default database to use for unqualified table names", + ) + + include_usage_statistics: bool = Field( + default=False, + description="Generate usage statistic.", + ) + + +@platform_name("Teradata") +@config_class(TeradataConfig) +@support_status(SupportStatus.TESTING) +@capability(SourceCapability.DOMAINS, "Enabled by default") +@capability(SourceCapability.CONTAINERS, "Enabled by default") +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DELETION_DETECTION, "Optionally enabled via configuration") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") +@capability(SourceCapability.LINEAGE_FINE, "Optionally enabled via configuration") +@capability(SourceCapability.USAGE_STATS, "Optionally enabled via configuration") +class TeradataSource(TwoTierSQLAlchemySource): + """ + This plugin extracts the following: + + - Metadata for databases, schemas, views, and tables + - Column types associated with each table + - Table, row, and column statistics via optional SQL profiling + """ + + config: TeradataConfig + + LINEAGE_QUERY: str = """SELECT ProcID, UserName as "user", StartTime AT TIME ZONE 'GMT' as "timestamp", DefaultDatabase as default_database, QueryText as query + FROM "DBC".DBQLogTbl + where ErrorCode = 0 + and QueryText like 'create table demo_user.test_lineage%' + and "timestamp" >= TIMESTAMP '{start_time}' + and "timestamp" < TIMESTAMP '{end_time}' + """ + + def __init__(self, config: TeradataConfig, ctx: PipelineContext): + super().__init__(config, ctx, "teradata") + + self.report: TeradataReport = TeradataReport() + self.graph: Optional[DataHubGraph] = ctx.graph + + self.builder: SqlParsingBuilder = SqlParsingBuilder( + usage_config=self.config.usage + if self.config.include_usage_statistics + else None, + generate_lineage=True, + generate_usage_statistics=self.config.include_usage_statistics, + generate_operations=self.config.usage.include_operational_stats, + ) + + self.schema_resolver = SchemaResolver( + platform=self.platform, + platform_instance=self.config.platform_instance, + graph=None, + env=self.config.env, + ) + + @classmethod + def create(cls, config_dict, ctx): + config = TeradataConfig.parse_obj(config_dict) + return cls(config, ctx) + + def get_audit_log_mcps(self) -> Iterable[MetadataWorkUnit]: + engine = self.get_metadata_engine() + for entry in engine.execute( + self.LINEAGE_QUERY.format( + start_time=self.config.start_time, end_time=self.config.end_time + ) + ): + self.report.num_queries_parsed += 1 + if self.report.num_queries_parsed % 1000 == 0: + logger.info(f"Parsed {self.report.num_queries_parsed} queries") + + yield from self.gen_lineage_from_query( + query=entry.query, + default_database=entry.default_database, + timestamp=entry.timestamp, + user=entry.user, + is_view_ddl=False, + ) + + def gen_lineage_from_query( + self, + query: str, + default_database: Optional[str] = None, + timestamp: Optional[datetime] = None, + user: Optional[str] = None, + is_view_ddl: bool = False, + ) -> Iterable[MetadataWorkUnit]: + result = sqlglot_lineage( + sql=query, + schema_resolver=self.schema_resolver, + default_db=None, + 
default_schema=default_database + if default_database + else self.config.default_db, + ) + if result.debug_info.table_error: + logger.debug( + f"Error parsing table lineage, {result.debug_info.table_error}" + ) + self.report.num_table_parse_failures += 1 + else: + yield from self.builder.process_sql_parsing_result( + result, + query=query, + is_view_ddl=is_view_ddl, + query_timestamp=timestamp, + user=f"urn:li:corpuser:{user}", + include_urns=self.schema_resolver.get_urns(), + ) + + def get_metadata_engine(self) -> Engine: + url = self.config.get_sql_alchemy_url() + logger.debug(f"sql_alchemy_url={url}") + return create_engine(url, **self.config.options) + + def get_workunits_internal(self) -> Iterable[Union[MetadataWorkUnit, SqlWorkUnit]]: + # Add all schemas to the schema resolver + yield from super().get_workunits_internal() + + if self.config.include_table_lineage or self.config.include_usage_statistics: + self.report.report_ingestion_stage_start("audit log extraction") + yield from self.get_audit_log_mcps() + + yield from self.builder.gen_workunits() diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py index d9062cef06eae0..efb1d3ffe119fc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/two_tier_sql_source.py @@ -1,8 +1,10 @@ import typing -from typing import Any, Dict, Iterable, Optional +import urllib.parse +from typing import Any, Dict, Iterable, Optional, Tuple from pydantic.fields import Field from sqlalchemy import create_engine, inspect +from sqlalchemy.engine import URL from sqlalchemy.engine.reflection import Inspector from datahub.configuration.common import AllowDenyPattern @@ -41,14 +43,27 @@ def get_sql_alchemy_url( uri_opts: typing.Optional[typing.Dict[str, typing.Any]] = None, current_db: typing.Optional[str] = None, ) -> str: - return self.sqlalchemy_uri or make_sqlalchemy_uri( - self.scheme, - self.username, - self.password.get_secret_value() if self.password else None, - self.host_port, - current_db if current_db else self.database, - uri_opts=uri_opts, - ) + if self.sqlalchemy_uri: + parsed_url = urllib.parse.urlsplit(self.sqlalchemy_uri) + url = URL.create( + drivername=parsed_url.scheme, + username=parsed_url.username, + password=parsed_url.password, + host=parsed_url.hostname, + port=parsed_url.port, + database=current_db or parsed_url.path.lstrip("/"), + query=urllib.parse.parse_qs(parsed_url.query), + ).update_query_dict(uri_opts or {}) + return str(url) + else: + return make_sqlalchemy_uri( + self.scheme, + self.username, + self.password.get_secret_value() if self.password else None, + self.host_port, + current_db or self.database, + uri_opts=uri_opts, + ) class TwoTierSQLAlchemySource(SQLAlchemySource): @@ -56,6 +71,10 @@ def __init__(self, config, ctx, platform): super().__init__(config, ctx, platform) self.config: TwoTierSQLAlchemyConfig = config + def get_db_schema(self, dataset_identifier: str) -> Tuple[Optional[str], str]: + schema, _view = dataset_identifier.split(".", 1) + return None, schema + def get_database_container_key(self, db_name: str, schema: str) -> ContainerKey: # Because our overridden get_allowed_schemas method returns db_name as the schema name, # the db_name and schema here will be the same. Hence, we just ignore the schema parameter. 
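For reviewers, a minimal standalone sketch (not part of the patch) of the URI handling that the `get_sql_alchemy_url` override above implements for a user-supplied `sqlalchemy_uri`, assuming SQLAlchemy 1.4's `URL` API; the helper name `rebuild_uri` is illustrative only:

import urllib.parse
from typing import Any, Dict, Optional

from sqlalchemy.engine import URL


def rebuild_uri(
    sqlalchemy_uri: str,
    current_db: Optional[str] = None,
    uri_opts: Optional[Dict[str, Any]] = None,
) -> str:
    # Split the user-supplied URI into its components.
    parsed = urllib.parse.urlsplit(sqlalchemy_uri)
    # Rebuild it, optionally swapping in the current database, and merge any
    # extra uri_opts on top of the query parameters already present.
    url = URL.create(
        drivername=parsed.scheme,
        username=parsed.username,
        password=parsed.password,
        host=parsed.hostname,
        port=parsed.port,
        database=current_db or parsed.path.lstrip("/"),
        query=urllib.parse.parse_qs(parsed.query),
    ).update_query_dict(uri_opts or {})
    return str(url)


# rebuild_uri("hive://localhost:10000/db?auth=LDAP", uri_opts={"timeout": "30"})
# -> "hive://localhost:10000/db?auth=LDAP&timeout=30"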
diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py index a417cae2b1ab03..b89db755853bc3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/vertica.py @@ -86,7 +86,7 @@ class VerticaConfig(BasicSQLAlchemyConfig): default=True, description="Whether Models should be ingested." ) - include_view_lineage: Optional[bool] = pydantic.Field( + include_view_lineage: bool = pydantic.Field( default=True, description="If the source supports it, include view lineage to the underlying storage location.", ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py index 2fcc93292c2efe..fcf97e461967c7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql_queries.py @@ -20,11 +20,17 @@ from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( SupportStatus, + capability, config_class, platform_name, support_status, ) -from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source, SourceReport +from datahub.ingestion.api.source import ( + MetadataWorkUnitProcessor, + Source, + SourceCapability, + SourceReport, +) from datahub.ingestion.api.source_helpers import auto_workunit_reporter from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.graph.client import DataHubGraph @@ -83,6 +89,8 @@ def compute_stats(self) -> None: @platform_name("SQL Queries") @config_class(SqlQueriesSourceConfig) @support_status(SupportStatus.TESTING) +@capability(SourceCapability.LINEAGE_COARSE, "Parsed from SQL queries") +@capability(SourceCapability.LINEAGE_FINE, "Parsed from SQL queries") class SqlQueriesSource(Source): # TODO: Documentation urns: Optional[Set[str]] @@ -103,13 +111,12 @@ def __init__(self, ctx: PipelineContext, config: SqlQueriesSourceConfig): self.builder = SqlParsingBuilder(usage_config=self.config.usage) if self.config.use_schema_resolver: - schema_resolver, urns = self.graph.initialize_schema_resolver_from_datahub( + self.schema_resolver = self.graph.initialize_schema_resolver_from_datahub( platform=self.config.platform, platform_instance=self.config.platform_instance, env=self.config.env, ) - self.schema_resolver = schema_resolver - self.urns = urns + self.urns = self.schema_resolver.get_urns() else: self.schema_resolver = self.graph._make_schema_resolver( platform=self.config.platform, diff --git a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py index be97e9380f1f57..7fb2cf9813cab1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py +++ b/metadata-ingestion/src/datahub/ingestion/source/state/stateful_ingestion_base.py @@ -11,7 +11,6 @@ ConfigModel, ConfigurationError, DynamicTypedConfig, - LineageConfig, ) from datahub.configuration.time_window_config import BaseTimeWindowConfig from datahub.configuration.validate_field_rename import pydantic_renamed_field @@ -100,7 +99,7 @@ class StatefulIngestionConfigBase(GenericModel, Generic[CustomConfig]): ) -class StatefulLineageConfigMixin(LineageConfig): +class StatefulLineageConfigMixin: enable_stateful_lineage_ingestion: bool = Field( default=True, description="Enable stateful lineage 
ingestion." diff --git a/metadata-ingestion/src/datahub/ingestion/source/superset.py b/metadata-ingestion/src/datahub/ingestion/source/superset.py index 2a4563439b6bab..e491a1e8b82fa3 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/superset.py +++ b/metadata-ingestion/src/datahub/ingestion/source/superset.py @@ -21,7 +21,9 @@ ) from datahub.ingestion.api.source import MetadataWorkUnitProcessor, Source from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.sql import sql_common +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalHandler, StaleEntityRemovalSourceReport, @@ -140,6 +142,7 @@ def get_filter_name(filter_obj): @capability( SourceCapability.DELETION_DETECTION, "Optionally enabled via stateful_ingestion" ) +@capability(SourceCapability.LINEAGE_COARSE, "Supported by default") class SupersetSource(StatefulIngestionSourceBase): """ This plugin extracts the following: @@ -202,7 +205,7 @@ def get_platform_from_database_id(self, database_id): sqlalchemy_uri = database_response.get("result", {}).get("sqlalchemy_uri") if sqlalchemy_uri is None: return database_response.get("result", {}).get("backend", "external") - return sql_common.get_platform_from_sqlalchemy_uri(sqlalchemy_uri) + return get_platform_from_sqlalchemy_uri(sqlalchemy_uri) @lru_cache(maxsize=None) def get_datasource_urn_from_id(self, datasource_id): diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 4cc00a66116e93..4bc40b0aac9649 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -37,11 +37,11 @@ ConfigModel, ConfigurationError, ) -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated from datahub.configuration.source_common import ( DatasetLineageProviderConfigBase, DatasetSourceConfigMixin, ) +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( ContainerKey, @@ -77,6 +77,7 @@ FIELD_TYPE_MAPPING, MetadataQueryException, TableauLineageOverrides, + TableauUpstreamReference, clean_query, custom_sql_graphql_query, dashboard_graphql_query, @@ -85,7 +86,6 @@ get_overridden_info, get_unique_custom_sql, make_fine_grained_lineage_class, - make_table_urn, make_upstream_class, published_datasource_graphql_query, query_metadata, @@ -271,7 +271,7 @@ class TableauConfig( "You can change this if your Tableau projects contain slashes in their names, and you'd like to filter by project.", ) - default_schema_map: dict = Field( + default_schema_map: Dict[str, str] = Field( default={}, description="Default schema to use when schema is not found." 
) ingest_tags: Optional[bool] = Field( @@ -452,6 +452,10 @@ class TableauSourceReport(StaleEntityRemovalSourceReport): @capability(SourceCapability.OWNERSHIP, "Requires recipe configuration") @capability(SourceCapability.TAGS, "Requires recipe configuration") @capability(SourceCapability.LINEAGE_COARSE, "Enabled by default") +@capability( + SourceCapability.LINEAGE_FINE, + "Enabled by default, configure using `extract_column_level_lineage`", +) class TableauSource(StatefulIngestionSourceBase): platform = "tableau" @@ -533,7 +537,7 @@ def fetch_projects(): path=[], ) # Set parent project name - for project_id, project in all_project_map.items(): + for _project_id, project in all_project_map.items(): if ( project.parent_id is not None and project.parent_id in all_project_map @@ -997,41 +1001,16 @@ def get_upstream_tables( ) continue - schema = table.get(tableau_constant.SCHEMA) or "" - table_name = table.get(tableau_constant.NAME) or "" - full_name = table.get(tableau_constant.FULL_NAME) or "" - upstream_db = ( - table[tableau_constant.DATABASE][tableau_constant.NAME] - if table.get(tableau_constant.DATABASE) - and table[tableau_constant.DATABASE].get(tableau_constant.NAME) - else "" - ) - logger.debug( - "Processing Table with Connection Type: {0} and id {1}".format( - table.get(tableau_constant.CONNECTION_TYPE) or "", - table.get(tableau_constant.ID) or "", + try: + ref = TableauUpstreamReference.create( + table, default_schema_map=self.config.default_schema_map ) - ) - schema = self._get_schema(schema, upstream_db, full_name) - # if the schema is included within the table name we omit it - if ( - schema - and table_name - and full_name - and table_name == full_name - and schema in table_name - ): - logger.debug( - f"Omitting schema for upstream table {table[tableau_constant.ID]}, schema included in table name" - ) - schema = "" + except Exception as e: + logger.info(f"Failed to generate upstream reference for {table}: {e}") + continue - table_urn = make_table_urn( + table_urn = ref.make_dataset_urn( self.config.env, - upstream_db, - table.get(tableau_constant.CONNECTION_TYPE) or "", - schema, - table_name, self.config.platform_instance_map, self.config.lineage_overrides, ) @@ -1052,7 +1031,7 @@ def get_upstream_tables( urn=table_urn, id=table[tableau_constant.ID], num_cols=num_tbl_cols, - paths=set([table_path]) if table_path else set(), + paths={table_path} if table_path else set(), ) else: self.database_tables[table_urn].update_table( @@ -1179,8 +1158,6 @@ def get_upstream_fields_of_field_in_datasource( def get_upstream_fields_from_custom_sql( self, datasource: dict, datasource_urn: str ) -> List[FineGrainedLineage]: - fine_grained_lineages: List[FineGrainedLineage] = [] - parsed_result = self.parse_custom_sql( datasource=datasource, datasource_urn=datasource_urn, @@ -1194,13 +1171,20 @@ def get_upstream_fields_from_custom_sql( logger.info( f"Failed to extract column level lineage from datasource {datasource_urn}" ) - return fine_grained_lineages + return [] + if parsed_result.debug_info.error: + logger.info( + f"Failed to extract column level lineage from datasource {datasource_urn}: {parsed_result.debug_info.error}" + ) + return [] cll: List[ColumnLineageInfo] = ( parsed_result.column_lineage if parsed_result.column_lineage is not None else [] ) + + fine_grained_lineages: List[FineGrainedLineage] = [] for cll_info in cll: downstream = ( [ @@ -2457,35 +2441,6 @@ def emit_embedded_datasources(self) -> Iterable[MetadataWorkUnit]: is_embedded_ds=True, ) - @lru_cache(maxsize=None) - def 
_get_schema(self, schema_provided: str, database: str, fullName: str) -> str: - # For some databases, the schema attribute in tableau api does not return - # correct schema name for the table. For more information, see - # https://help.tableau.com/current/api/metadata_api/en-us/docs/meta_api_model.html#schema_attribute. - # Hence we extract schema from fullName whenever fullName is available - schema = self._extract_schema_from_fullName(fullName) if fullName else "" - if not schema: - schema = schema_provided - elif schema != schema_provided: - logger.debug( - "Correcting schema, provided {0}, corrected {1}".format( - schema_provided, schema - ) - ) - - if not schema and database in self.config.default_schema_map: - schema = self.config.default_schema_map[database] - - return schema - - @lru_cache(maxsize=None) - def _extract_schema_from_fullName(self, fullName: str) -> str: - # fullName is observed to be in format [schemaName].[tableName] - # OR simply tableName OR [tableName] - if fullName.startswith("[") and "].[" in fullName: - return fullName[1 : fullName.index("]")] - return "" - @lru_cache(maxsize=None) def get_last_modified( self, creator: Optional[str], created_at: bytes, updated_at: bytes diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py index 2c92285fdba77a..7c4852042ce7c8 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau_common.py @@ -1,4 +1,6 @@ import html +import logging +from dataclasses import dataclass from functools import lru_cache from typing import Dict, List, Optional, Tuple @@ -6,6 +8,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.common import ConfigModel +from datahub.ingestion.source import tableau_constant as tc from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageType, FineGrainedLineage, @@ -31,6 +34,8 @@ ) from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult +logger = logging.getLogger(__name__) + class TableauLineageOverrides(ConfigModel): platform_override_map: Optional[Dict[str, str]] = Field( @@ -537,12 +542,12 @@ def get_fully_qualified_table_name( platform: str, upstream_db: str, schema: str, - full_name: str, + table_name: str, ) -> str: if platform == "athena": upstream_db = "" database_name = f"{upstream_db}." if upstream_db else "" - final_name = full_name.replace("[", "").replace("]", "") + final_name = table_name.replace("[", "").replace("]", "") schema_name = f"{schema}." 
if schema else "" @@ -573,17 +578,123 @@ def get_fully_qualified_table_name( return fully_qualified_table_name -def get_platform_instance( - platform: str, platform_instance_map: Optional[Dict[str, str]] -) -> Optional[str]: - if platform_instance_map is not None and platform in platform_instance_map.keys(): - return platform_instance_map[platform] +@dataclass +class TableauUpstreamReference: + database: Optional[str] + schema: Optional[str] + table: str + + connection_type: str + + @classmethod + def create( + cls, d: dict, default_schema_map: Optional[Dict[str, str]] = None + ) -> "TableauUpstreamReference": + # Values directly from `table` object from Tableau + database = t_database = d.get(tc.DATABASE, {}).get(tc.NAME) + schema = t_schema = d.get(tc.SCHEMA) + table = t_table = d.get(tc.NAME) or "" + t_full_name = d.get(tc.FULL_NAME) + t_connection_type = d[tc.CONNECTION_TYPE] # required to generate urn + t_id = d[tc.ID] + + parsed_full_name = cls.parse_full_name(t_full_name) + if parsed_full_name and len(parsed_full_name) == 3: + database, schema, table = parsed_full_name + elif parsed_full_name and len(parsed_full_name) == 2: + schema, table = parsed_full_name + else: + logger.debug( + f"Upstream urn generation ({t_id}):" + f" Did not parse full name {t_full_name}: unexpected number of values", + ) + + if not schema and default_schema_map and database in default_schema_map: + schema = default_schema_map[database] + + if database != t_database: + logger.debug( + f"Upstream urn generation ({t_id}):" + f" replacing database {t_database} with {database} from full name {t_full_name}" + ) + if schema != t_schema: + logger.debug( + f"Upstream urn generation ({t_id}):" + f" replacing schema {t_schema} with {schema} from full name {t_full_name}" + ) + if table != t_table: + logger.debug( + f"Upstream urn generation ({t_id}):" + f" replacing table {t_table} with {table} from full name {t_full_name}" + ) + + # TODO: See if we can remove this -- made for redshift + if ( + schema + and t_table + and t_full_name + and t_table == t_full_name + and schema in t_table + ): + logger.debug( + f"Omitting schema for upstream table {t_id}, schema included in table name" + ) + schema = "" + + return cls( + database=database, + schema=schema, + table=table, + connection_type=t_connection_type, + ) + + @staticmethod + def parse_full_name(full_name: Optional[str]) -> Optional[List[str]]: + # fullName is observed to be in formats: + # [database].[schema].[table] + # [schema].[table] + # [table] + # table + # schema + + # TODO: Validate the startswith check. 
Currently required for our integration tests + if full_name is None or not full_name.startswith("["): + return None + + return full_name.replace("[", "").replace("]", "").split(".") + + def make_dataset_urn( + self, + env: str, + platform_instance_map: Optional[Dict[str, str]], + lineage_overrides: Optional[TableauLineageOverrides] = None, + ) -> str: + ( + upstream_db, + platform_instance, + platform, + original_platform, + ) = get_overridden_info( + connection_type=self.connection_type, + upstream_db=self.database, + lineage_overrides=lineage_overrides, + platform_instance_map=platform_instance_map, + ) + + table_name = get_fully_qualified_table_name( + original_platform, + upstream_db or "", + self.schema, + self.table, + ) - return None + return builder.make_dataset_urn_with_platform_instance( + platform, table_name, platform_instance, env + ) def get_overridden_info( - connection_type: str, + connection_type: Optional[str], upstream_db: Optional[str], platform_instance_map: Optional[Dict[str, str]], lineage_overrides: Optional[TableauLineageOverrides] = None, @@ -605,7 +716,9 @@ def get_overridden_info( ): upstream_db = lineage_overrides.database_override_map[upstream_db] - platform_instance = get_platform_instance(original_platform, platform_instance_map) + platform_instance = ( + platform_instance_map.get(original_platform) if platform_instance_map else None + ) if original_platform in ("athena", "hive", "mysql"): # Two tier databases upstream_db = None @@ -613,35 +726,6 @@ def get_overridden_info( return upstream_db, platform_instance, platform, original_platform -def make_table_urn( - env: str, - upstream_db: Optional[str], - connection_type: str, - schema: str, - full_name: str, - platform_instance_map: Optional[Dict[str, str]], - lineage_overrides: Optional[TableauLineageOverrides] = None, -) -> str: - - upstream_db, platform_instance, platform, original_platform = get_overridden_info( - connection_type=connection_type, - upstream_db=upstream_db, - lineage_overrides=lineage_overrides, - platform_instance_map=platform_instance_map, - ) - - table_name = get_fully_qualified_table_name( - original_platform, - upstream_db if upstream_db is not None else "", - schema, - full_name, - ) - - return builder.make_dataset_urn_with_platform_instance( - platform, table_name, platform_instance, env - ) - - def make_description_from_params(description, formula): """ Generate column description diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py index 94ff755e3b2541..16820c37d546ef 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/config.py @@ -1,3 +1,4 @@ +import logging import os from datetime import datetime, timedelta, timezone from typing import Any, Dict, Optional @@ -6,7 +7,10 @@ from pydantic import Field from datahub.configuration.common import AllowDenyPattern, ConfigModel -from datahub.configuration.source_common import DatasetSourceConfigMixin +from datahub.configuration.source_common import ( + DatasetSourceConfigMixin, + LowerCaseDatasetUrnConfigMixin, +) from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.ingestion.source.state.stale_entity_removal_handler import ( @@ -21,6 +25,9 @@ OperationConfig, is_profiling_enabled, ) +from datahub.utilities.global_warning_util import 
add_global_warning + +logger = logging.getLogger(__name__) class UnityCatalogProfilerConfig(ConfigModel): @@ -87,6 +94,7 @@ class UnityCatalogSourceConfig( BaseUsageConfig, DatasetSourceConfigMixin, StatefulProfilingConfigMixin, + LowerCaseDatasetUrnConfigMixin, ): token: str = pydantic.Field(description="Databricks personal access token") workspace_url: str = pydantic.Field( @@ -97,9 +105,25 @@ class UnityCatalogSourceConfig( description="Name of the workspace. Default to deployment name present in workspace_url", ) + include_metastore: bool = pydantic.Field( + default=True, + description=( + "Whether to ingest the workspace's metastore as a container and include it in all urns." + " Changing this will affect the urns of all entities in the workspace." + " This will be disabled by default in the future," + " so it is recommended to set this to `False` for new ingestions." + " If you have an existing unity catalog ingestion, you'll want to avoid duplicates by soft deleting existing data." + " If stateful ingestion is enabled, running with `include_metastore: false` should be sufficient." + " Otherwise, we recommend deleting via the cli: `datahub delete --platform databricks` and re-ingesting with `include_metastore: false`." + ), + ) + ingest_data_platform_instance_aspect: Optional[bool] = pydantic.Field( default=False, - description="Option to enable/disable ingestion of the data platform instance aspect. The default data platform instance id for a dataset is workspace_name", + description=( + "Option to enable/disable ingestion of the data platform instance aspect." + " The default data platform instance id for a dataset is workspace_name" + ), ) _only_ingest_assigned_metastore_removed = pydantic_removed_field( @@ -122,16 +146,39 @@ class UnityCatalogSourceConfig( default=AllowDenyPattern.allow_all(), description="Regex patterns for tables to filter in ingestion. Specify regex to match the entire table name in `catalog.schema.table` format. e.g. to match all tables starting with customer in Customer catalog and public schema, use the regex `Customer\\.public\\.customer.*`.", ) + + notebook_pattern: AllowDenyPattern = Field( + default=AllowDenyPattern.allow_all(), + description=( + "Regex patterns for notebooks to filter in ingestion, based on notebook *path*." + " Specify regex to match the entire notebook path in `//.../` format." + " e.g. to match all notebooks in the root Shared directory, use the regex `/Shared/.*`." + ), + ) + domain: Dict[str, AllowDenyPattern] = Field( default=dict(), description='Attach domains to catalogs, schemas or tables during ingestion using regex patterns. Domain key can be a guid like *urn:li:domain:ec428203-ce86-4db3-985d-5a8ee6df32ba* or a string like "Marketing".) If you provide strings, then datahub will attempt to resolve this name to a guid, and will error out if this fails. There can be multiple domain keys specified.', ) - include_table_lineage: Optional[bool] = pydantic.Field( + include_table_lineage: bool = pydantic.Field( default=True, description="Option to enable/disable lineage generation.", ) + include_external_lineage: bool = pydantic.Field( + default=True, + description=( + "Option to enable/disable lineage generation for external tables." + " Only external S3 tables are supported at the moment." 
+    ),
+
+    include_notebooks: bool = pydantic.Field(
+        default=False,
+        description="Ingest notebooks, represented as DataHub datasets.",
+    )
+
     include_ownership: bool = pydantic.Field(
         default=False,
         description="Option to enable/disable ownership generation for metastores, catalogs, schemas, and tables.",
@@ -141,11 +188,22 @@ class UnityCatalogSourceConfig(
         "include_table_ownership", "include_ownership"
     )
-    include_column_lineage: Optional[bool] = pydantic.Field(
+    include_column_lineage: bool = pydantic.Field(
         default=True,
         description="Option to enable/disable column lineage generation. Currently we have to make one REST call per column due to the Databricks API, which can slow down ingestion.",
     )
+    column_lineage_column_limit: int = pydantic.Field(
+        default=300,
+        description="Limit the number of columns for which to fetch column-level lineage.",
+    )
+
+    lineage_max_workers: int = pydantic.Field(
+        default=5 * (os.cpu_count() or 4),
+        description="Number of worker threads to use for the column lineage thread pool executor. Set to 1 to disable.",
+        hidden_from_docs=True,
+    )
+
     include_usage_statistics: bool = Field(
         default=True,
         description="Generate usage statistics.",
@@ -177,3 +235,16 @@ def workspace_url_should_start_with_http_scheme(cls, workspace_url: str) -> str:
                 "Workspace URL must start with http scheme. e.g. https://my-workspace.cloud.databricks.com"
             )
         return workspace_url
+
+    @pydantic.validator("include_metastore")
+    def include_metastore_warning(cls, v: bool) -> bool:
+        if v:
+            msg = (
+                "`include_metastore` is enabled."
+                " This is not recommended and will be disabled by default in the future, which is a breaking change."
+                " All databricks urns will change if you re-ingest with this disabled."
+                " We recommend soft deleting all databricks data and re-ingesting with `include_metastore` set to `False`."
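# --- Editor's illustrative sketch (not part of the patch): a minimal recipe-style
# config exercising the new options above. Field names come from this diff; all
# values are hypothetical.
unity_config = {
    "token": "<databricks-personal-access-token>",
    "workspace_url": "https://my-workspace.cloud.databricks.com",
    "include_metastore": False,  # recommended for new ingestions (see the warning validator above)
    "include_notebooks": True,
    "notebook_pattern": {"allow": ["/Shared/.*"]},
    "include_external_lineage": True,  # S3 upstreams only, per the description above
    "column_lineage_column_limit": 300,
}
# With pydantic v1, UnityCatalogSourceConfig.parse_obj(unity_config) would
# validate this dict against the model defined in this file.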
+ ) + logger.warning(msg) + add_global_warning(msg) + return v diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py index e92f4ff07b1ad3..3fb77ce512ed24 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy.py @@ -23,6 +23,7 @@ QueryStatementType, QueryStatus, ) +from databricks.sdk.service.workspace import ObjectType import datahub from datahub.ingestion.source.unity.proxy_profiling import ( @@ -32,7 +33,9 @@ ALLOWED_STATEMENT_TYPES, Catalog, Column, + ExternalTableReference, Metastore, + Notebook, Query, Schema, ServicePrincipal, @@ -95,14 +98,13 @@ def __init__( self.report = report def check_basic_connectivity(self) -> bool: - self._workspace_client.metastores.summary() - return True + return bool(self._workspace_client.catalogs.list()) def assigned_metastore(self) -> Metastore: response = self._workspace_client.metastores.summary() return self._create_metastore(response) - def catalogs(self, metastore: Metastore) -> Iterable[Catalog]: + def catalogs(self, metastore: Optional[Metastore]) -> Iterable[Catalog]: response = self._workspace_client.catalogs.list() if not response: logger.info("Catalogs not found") @@ -137,6 +139,21 @@ def service_principals(self) -> Iterable[ServicePrincipal]: for principal in self._workspace_client.service_principals.list(): yield self._create_service_principal(principal) + def workspace_notebooks(self) -> Iterable[Notebook]: + for obj in self._workspace_client.workspace.list("/", recursive=True): + if obj.object_type == ObjectType.NOTEBOOK: + yield Notebook( + id=obj.object_id, + path=obj.path, + language=obj.language, + created_at=datetime.fromtimestamp( + obj.created_at / 1000, tz=timezone.utc + ), + modified_at=datetime.fromtimestamp( + obj.modified_at / 1000, tz=timezone.utc + ), + ) + def query_history( self, start_time: datetime, @@ -153,7 +170,7 @@ def query_history( "start_time_ms": start_time.timestamp() * 1000, "end_time_ms": end_time.timestamp() * 1000, }, - "statuses": [QueryStatus.FINISHED.value], + "statuses": [QueryStatus.FINISHED], "statement_types": [typ.value for typ in ALLOWED_STATEMENT_TYPES], } ) @@ -196,64 +213,79 @@ def _query_history( method, path, body={**body, "page_token": response["next_page_token"]} ) - def list_lineages_by_table(self, table_name: str) -> dict: + def list_lineages_by_table( + self, table_name: str, include_entity_lineage: bool + ) -> dict: """List table lineage by table name.""" return self._workspace_client.api_client.do( method="GET", - path="/api/2.0/lineage-tracking/table-lineage/get", - body={"table_name": table_name}, + path="/api/2.0/lineage-tracking/table-lineage", + body={ + "table_name": table_name, + "include_entity_lineage": include_entity_lineage, + }, ) def list_lineages_by_column(self, table_name: str, column_name: str) -> dict: """List column lineage by table name and column name.""" return self._workspace_client.api_client.do( "GET", - "/api/2.0/lineage-tracking/column-lineage/get", + "/api/2.0/lineage-tracking/column-lineage", body={"table_name": table_name, "column_name": column_name}, ) - def table_lineage(self, table: Table) -> None: + def table_lineage(self, table: Table, include_entity_lineage: bool) -> None: # Lineage endpoint doesn't exists on 2.1 version try: response: dict = self.list_lineages_by_table( - table_name=f"{table.schema.catalog.name}.{table.schema.name}.{table.name}" + 
table_name=table.ref.qualified_table_name, + include_entity_lineage=include_entity_lineage, ) - table.upstreams = { - TableReference( - table.schema.catalog.metastore.id, - item["catalog_name"], - item["schema_name"], - item["name"], - ): {} - for item in response.get("upstream_tables", []) - } + + for item in response.get("upstreams") or []: + if "tableInfo" in item: + table_ref = TableReference.create_from_lineage( + item["tableInfo"], table.schema.catalog.metastore + ) + if table_ref: + table.upstreams[table_ref] = {} + elif "fileInfo" in item: + external_ref = ExternalTableReference.create_from_lineage( + item["fileInfo"] + ) + if external_ref: + table.external_upstreams.add(external_ref) + + for notebook in item.get("notebookInfos") or []: + table.upstream_notebooks.add(notebook["notebook_id"]) + + for item in response.get("downstreams") or []: + for notebook in item.get("notebookInfos") or []: + table.downstream_notebooks.add(notebook["notebook_id"]) except Exception as e: - logger.error(f"Error getting lineage: {e}") + logger.warning( + f"Error getting lineage on table {table.ref}: {e}", exc_info=True + ) - def get_column_lineage(self, table: Table) -> None: + def get_column_lineage(self, table: Table, column_name: str) -> None: try: - table_lineage_response: dict = self.list_lineages_by_table( - table_name=f"{table.schema.catalog.name}.{table.schema.name}.{table.name}" + response: dict = self.list_lineages_by_column( + table_name=table.ref.qualified_table_name, + column_name=column_name, ) - if table_lineage_response: - for column in table.columns: - response: dict = self.list_lineages_by_column( - table_name=f"{table.schema.catalog.name}.{table.schema.name}.{table.name}", - column_name=column.name, - ) - for item in response.get("upstream_cols", []): - table_ref = TableReference( - table.schema.catalog.metastore.id, - item["catalog_name"], - item["schema_name"], - item["table_name"], - ) - table.upstreams.setdefault(table_ref, {}).setdefault( - column.name, [] - ).append(item["name"]) - + for item in response.get("upstream_cols") or []: + table_ref = TableReference.create_from_lineage( + item, table.schema.catalog.metastore + ) + if table_ref: + table.upstreams.setdefault(table_ref, {}).setdefault( + column_name, [] + ).append(item["name"]) except Exception as e: - logger.error(f"Error getting lineage: {e}") + logger.warning( + f"Error getting column lineage on table {table.ref}, column {column_name}: {e}", + exc_info=True, + ) @staticmethod def _escape_sequence(value: str) -> str: @@ -274,10 +306,13 @@ def _create_metastore( comment=None, ) - def _create_catalog(self, metastore: Metastore, obj: CatalogInfo) -> Catalog: + def _create_catalog( + self, metastore: Optional[Metastore], obj: CatalogInfo + ) -> Catalog: + catalog_name = self._escape_sequence(obj.name) return Catalog( name=obj.name, - id=f"{metastore.id}.{self._escape_sequence(obj.name)}", + id=f"{metastore.id}.{catalog_name}" if metastore else catalog_name, metastore=metastore, comment=obj.comment, owner=obj.owner, diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py index 2b943d8c98e7d7..315c1c0d20186f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/proxy_types.py @@ -1,16 +1,20 @@ # Supported types are available at # 
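# --- Editor's illustrative sketch (not part of the patch): the response shape the
# new parsing loop above assumes. The payload below is inferred from the keys the
# code reads ("upstreams", "downstreams", "tableInfo", "fileInfo", "notebookInfos")
# and is not an official API example.
sample_response = {
    "upstreams": [
        {"tableInfo": {"catalog_name": "main", "schema_name": "raw", "name": "events"}},
        {"fileInfo": {"path": "s3://bucket/raw/events", "has_permission": True}},
        {"notebookInfos": [{"notebook_id": 12345}]},
    ],
    "downstreams": [{"notebookInfos": [{"notebook_id": 67890}]}],
}
upstream_tables = [u["tableInfo"] for u in sample_response["upstreams"] if "tableInfo" in u]
assert upstream_tables[0]["name"] == "events"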
https://api-docs.databricks.com/rest/latest/unity-catalog-api-specification-2-1.html?_ga=2.151019001.1795147704.1666247755-2119235717.1666247755 +import dataclasses +import logging from dataclasses import dataclass, field from datetime import datetime -from typing import Dict, List, Optional +from typing import Dict, FrozenSet, List, Optional, Set from databricks.sdk.service.catalog import ( CatalogType, ColumnTypeName, DataSourceFormat, + SecurableType, TableType, ) from databricks.sdk.service.sql import QueryStatementType +from databricks.sdk.service.workspace import Language from datahub.metadata.schema_classes import ( ArrayTypeClass, @@ -26,6 +30,8 @@ TimeTypeClass, ) +logger = logging.getLogger(__name__) + DATA_TYPE_REGISTRY: dict = { ColumnTypeName.BOOLEAN: BooleanTypeClass, ColumnTypeName.BYTE: BytesTypeClass, @@ -66,6 +72,9 @@ ALLOWED_STATEMENT_TYPES = {*OPERATION_STATEMENT_TYPES.keys(), QueryStatementType.SELECT} +NotebookId = int + + @dataclass class CommonProperty: id: str @@ -84,7 +93,7 @@ class Metastore(CommonProperty): @dataclass class Catalog(CommonProperty): - metastore: Metastore + metastore: Optional[Metastore] owner: Optional[str] type: CatalogType @@ -122,7 +131,7 @@ class ServicePrincipal: @dataclass(frozen=True, order=True) class TableReference: - metastore: str + metastore: Optional[str] catalog: str schema: str table: str @@ -130,14 +139,34 @@ class TableReference: @classmethod def create(cls, table: "Table") -> "TableReference": return cls( - table.schema.catalog.metastore.id, + table.schema.catalog.metastore.id + if table.schema.catalog.metastore + else None, table.schema.catalog.name, table.schema.name, table.name, ) + @classmethod + def create_from_lineage( + cls, d: dict, metastore: Optional[Metastore] + ) -> Optional["TableReference"]: + try: + return cls( + metastore.id if metastore else None, + d["catalog_name"], + d["schema_name"], + d.get("table_name", d["name"]), # column vs table query output + ) + except Exception as e: + logger.warning(f"Failed to create TableReference from {d}: {e}") + return None + def __str__(self) -> str: - return f"{self.metastore}.{self.catalog}.{self.schema}.{self.table}" + if self.metastore: + return f"{self.metastore}.{self.catalog}.{self.schema}.{self.table}" + else: + return self.qualified_table_name @property def qualified_table_name(self) -> str: @@ -148,13 +177,41 @@ def external_path(self) -> str: return f"{self.catalog}/{self.schema}/{self.table}" +@dataclass(frozen=True, order=True) +class ExternalTableReference: + path: str + has_permission: bool + name: Optional[str] + type: Optional[SecurableType] + storage_location: Optional[str] + + @classmethod + def create_from_lineage(cls, d: dict) -> Optional["ExternalTableReference"]: + try: + securable_type: Optional[SecurableType] + try: + securable_type = SecurableType(d.get("securable_type", "").lower()) + except ValueError: + securable_type = None + + return cls( + path=d["path"], + has_permission=d.get("has_permission") or True, + name=d.get("securable_name"), + type=securable_type, + storage_location=d.get("storage_location"), + ) + except Exception as e: + logger.warning(f"Failed to create ExternalTableReference from {d}: {e}") + return None + + @dataclass class Table(CommonProperty): schema: Schema columns: List[Column] storage_location: Optional[str] data_source_format: Optional[DataSourceFormat] - comment: Optional[str] table_type: TableType owner: Optional[str] generation: Optional[int] @@ -166,6 +223,9 @@ class Table(CommonProperty): view_definition: 
Optional[str] properties: Dict[str, str] upstreams: Dict[TableReference, Dict[str, List[str]]] = field(default_factory=dict) + external_upstreams: Set[ExternalTableReference] = field(default_factory=set) + upstream_notebooks: Set[NotebookId] = field(default_factory=set) + downstream_notebooks: Set[NotebookId] = field(default_factory=set) ref: TableReference = field(init=False) @@ -228,3 +288,23 @@ def __bool__(self): self.max is not None, ) ) + + +@dataclass +class Notebook: + id: NotebookId + path: str + language: Language + created_at: datetime + modified_at: datetime + + upstreams: FrozenSet[TableReference] = field(default_factory=frozenset) + + @classmethod + def add_upstream(cls, upstream: TableReference, notebook: "Notebook") -> "Notebook": + return cls( + **{ # type: ignore + **dataclasses.asdict(notebook), + "upstreams": frozenset([*notebook.upstreams, upstream]), + } + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py index 8382b31a56add6..4153d9dd88eb86 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/report.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/report.py @@ -5,21 +5,27 @@ from datahub.ingestion.source.state.stale_entity_removal_handler import ( StaleEntityRemovalSourceReport, ) +from datahub.ingestion.source_report.ingestion_stage import IngestionStageReport from datahub.utilities.lossy_collections import LossyDict, LossyList @dataclass -class UnityCatalogReport(StaleEntityRemovalSourceReport): +class UnityCatalogReport(IngestionStageReport, StaleEntityRemovalSourceReport): metastores: EntityFilterReport = EntityFilterReport.field(type="metastore") catalogs: EntityFilterReport = EntityFilterReport.field(type="catalog") schemas: EntityFilterReport = EntityFilterReport.field(type="schema") tables: EntityFilterReport = EntityFilterReport.field(type="table/view") table_profiles: EntityFilterReport = EntityFilterReport.field(type="table profile") + notebooks: EntityFilterReport = EntityFilterReport.field(type="notebook") + + num_column_lineage_skipped_column_count: int = 0 + num_external_upstreams_lacking_permissions: int = 0 + num_external_upstreams_unsupported: int = 0 num_queries: int = 0 num_queries_dropped_parse_failure: int = 0 - num_queries_dropped_missing_table: int = 0 # Can be due to pattern filter - num_queries_dropped_duplicate_table: int = 0 + num_queries_missing_table: int = 0 # Can be due to pattern filter + num_queries_duplicate_table: int = 0 num_queries_parsed_by_spark_plan: int = 0 # Distinguish from Operations emitted for created / updated timestamps diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py index 493acb939c3bb4..b63cf65d55dc87 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/source.py @@ -1,8 +1,9 @@ import logging import re import time +from concurrent.futures import ThreadPoolExecutor from datetime import timedelta -from typing import Dict, Iterable, List, Optional, Set +from typing import Dict, Iterable, List, Optional, Set, Union from urllib.parse import urljoin from datahub.emitter.mce_builder import ( @@ -16,9 +17,12 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import ( CatalogKey, + CatalogKeyWithMetastore, ContainerKey, MetastoreKey, + NotebookKey, UnitySchemaKey, + 
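# --- Editor's illustrative sketch (not part of the patch): Notebook.add_upstream
# above returns a *new* Notebook instead of mutating, since `upstreams` is a
# frozenset. The same copy-and-extend pattern, standalone; dataclasses.replace is
# an equivalent idiom to the asdict-and-reconstruct approach used in the diff.
import dataclasses
from typing import FrozenSet

@dataclasses.dataclass
class _NotebookSketch:
    id: int
    upstreams: FrozenSet[str] = dataclasses.field(default_factory=frozenset)

def _add_upstream(nb: _NotebookSketch, ref: str) -> _NotebookSketch:
    return dataclasses.replace(nb, upstreams=frozenset([*nb.upstreams, ref]))

nb = _NotebookSketch(id=1)
nb2 = _add_upstream(nb, "main.raw.events")
assert nb.upstreams == frozenset() and "main.raw.events" in nb2.upstreams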
UnitySchemaKeyWithMetastore, add_dataset_to_container, gen_containers, ) @@ -37,6 +41,7 @@ TestConnectionReport, ) from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.source.aws.s3_util import make_s3_urn_for_lineage from datahub.ingestion.source.common.subtypes import ( DatasetContainerSubTypes, DatasetSubTypes, @@ -56,6 +61,8 @@ Catalog, Column, Metastore, + Notebook, + NotebookId, Schema, ServicePrincipal, Table, @@ -69,6 +76,7 @@ ViewProperties, ) from datahub.metadata.schema_classes import ( + BrowsePathsClass, DataPlatformInstanceClass, DatasetLineageTypeClass, DatasetPropertiesClass, @@ -88,6 +96,7 @@ UpstreamClass, UpstreamLineageClass, ) +from datahub.utilities.file_backed_collections import FileBackedDict from datahub.utilities.hive_schema_to_avro import get_schema_fields_for_hive_column from datahub.utilities.registries.domain_registry import DomainRegistry @@ -122,7 +131,7 @@ class UnityCatalogSource(StatefulIngestionSourceBase, TestableSource): config: UnityCatalogSourceConfig unity_catalog_api_proxy: UnityCatalogApiProxy platform: str = "databricks" - platform_instance_name: str + platform_instance_name: Optional[str] def get_report(self) -> UnityCatalogReport: return self.report @@ -141,11 +150,13 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig): self.external_url_base = urljoin(self.config.workspace_url, "/explore/data") # Determine the platform_instance_name - self.platform_instance_name = ( - config.workspace_name - if config.workspace_name is not None - else config.workspace_url.split("//")[1].split(".")[0] - ) + self.platform_instance_name = self.config.platform_instance + if self.config.include_metastore: + self.platform_instance_name = ( + config.workspace_name + if config.workspace_name is not None + else config.workspace_url.split("//")[1].split(".")[0] + ) if self.config.domain: self.domain_registry = DomainRegistry( @@ -157,6 +168,7 @@ def __init__(self, ctx: PipelineContext, config: UnityCatalogSourceConfig): # Global set of table refs self.table_refs: Set[TableReference] = set() self.view_refs: Set[TableReference] = set() + self.notebooks: FileBackedDict[Notebook] = FileBackedDict() @staticmethod def test_connection(config_dict: dict) -> TestConnectionReport: @@ -176,6 +188,7 @@ def get_workunit_processors(self) -> List[Optional[MetadataWorkUnitProcessor]]: ] def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: + self.report.report_ingestion_stage_start("Start warehouse") wait_on_warehouse = None if self.config.is_profiling_enabled(): # Can take several minutes, so start now and wait later @@ -187,10 +200,23 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) return + self.report.report_ingestion_stage_start("Ingest service principals") self.build_service_principal_map() + if self.config.include_notebooks: + self.report.report_ingestion_stage_start("Ingest notebooks") + yield from self.process_notebooks() + yield from self.process_metastores() + if self.config.include_notebooks: + self.report.report_ingestion_stage_start("Notebook lineage") + for notebook in self.notebooks.values(): + wu = self._gen_notebook_lineage(notebook) + if wu: + yield wu + if self.config.include_usage_statistics: + self.report.report_ingestion_stage_start("Ingest usage") usage_extractor = UnityCatalogUsageExtractor( config=self.config, report=self.report, @@ -203,6 +229,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: ) if self.config.is_profiling_enabled(): + 
self.report.report_ingestion_stage_start("Wait on warehouse") assert wait_on_warehouse timeout = timedelta(seconds=self.config.profiling.max_wait_secs) wait_on_warehouse.result(timeout) @@ -212,6 +239,7 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]: self.unity_catalog_api_proxy, self.gen_dataset_urn, ) + self.report.report_ingestion_stage_start("Profiling") yield from profiling_extractor.get_workunits(self.table_refs) def build_service_principal_map(self) -> None: @@ -223,14 +251,72 @@ def build_service_principal_map(self) -> None: "service-principals", f"Unable to fetch service principals: {e}" ) + def process_notebooks(self) -> Iterable[MetadataWorkUnit]: + for notebook in self.unity_catalog_api_proxy.workspace_notebooks(): + if not self.config.notebook_pattern.allowed(notebook.path): + self.report.notebooks.dropped(notebook.path) + continue + + self.notebooks[str(notebook.id)] = notebook + yield from self._gen_notebook_workunits(notebook) + + def _gen_notebook_workunits(self, notebook: Notebook) -> Iterable[MetadataWorkUnit]: + mcps = MetadataChangeProposalWrapper.construct_many( + entityUrn=self.gen_notebook_urn(notebook), + aspects=[ + DatasetPropertiesClass( + name=notebook.path.rsplit("/", 1)[-1], + customProperties={ + "path": notebook.path, + "language": notebook.language.value, + }, + externalUrl=urljoin( + self.config.workspace_url, f"#notebook/{notebook.id}" + ), + created=TimeStampClass(int(notebook.created_at.timestamp() * 1000)), + lastModified=TimeStampClass( + int(notebook.modified_at.timestamp() * 1000) + ), + ), + SubTypesClass(typeNames=[DatasetSubTypes.NOTEBOOK]), + BrowsePathsClass(paths=notebook.path.split("/")), + self._create_data_platform_instance_aspect(), + ], + ) + for mcp in mcps: + yield mcp.as_workunit() + + self.report.notebooks.processed(notebook.path) + + def _gen_notebook_lineage(self, notebook: Notebook) -> Optional[MetadataWorkUnit]: + if not notebook.upstreams: + return None + + return MetadataChangeProposalWrapper( + entityUrn=self.gen_notebook_urn(notebook), + aspect=UpstreamLineageClass( + upstreams=[ + UpstreamClass( + dataset=self.gen_dataset_urn(upstream_ref), + type=DatasetLineageTypeClass.COPY, + ) + for upstream_ref in notebook.upstreams + ] + ), + ).as_workunit() + def process_metastores(self) -> Iterable[MetadataWorkUnit]: - metastore = self.unity_catalog_api_proxy.assigned_metastore() - yield from self.gen_metastore_containers(metastore) + metastore: Optional[Metastore] = None + if self.config.include_metastore: + metastore = self.unity_catalog_api_proxy.assigned_metastore() + yield from self.gen_metastore_containers(metastore) yield from self.process_catalogs(metastore) + if metastore and self.config.include_metastore: + self.report.metastores.processed(metastore.id) - self.report.metastores.processed(metastore.id) - - def process_catalogs(self, metastore: Metastore) -> Iterable[MetadataWorkUnit]: + def process_catalogs( + self, metastore: Optional[Metastore] + ) -> Iterable[MetadataWorkUnit]: for catalog in self.unity_catalog_api_proxy.catalogs(metastore=metastore): if not self.config.catalog_pattern.allowed(catalog.id): self.report.catalogs.dropped(catalog.id) @@ -247,6 +333,7 @@ def process_schemas(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]: self.report.schemas.dropped(schema.id) continue + self.report.report_ingestion_stage_start(f"Ingest schema {schema.id}") yield from self.gen_schema_containers(schema) yield from self.process_tables(schema) @@ -280,15 +367,15 @@ def process_table(self, table: Table, 
schema: Schema) -> Iterable[MetadataWorkUn operation = self._create_table_operation_aspect(table) domain = self._get_domain_aspect(dataset_name=table.ref.qualified_table_name) ownership = self._create_table_ownership_aspect(table) - data_platform_instance = self._create_data_platform_instance_aspect(table) + data_platform_instance = self._create_data_platform_instance_aspect() + + lineage = self.ingest_lineage(table) - lineage: Optional[UpstreamLineageClass] = None - if self.config.include_column_lineage: - self.unity_catalog_api_proxy.get_column_lineage(table) - lineage = self._generate_column_lineage_aspect(dataset_urn, table) - elif self.config.include_table_lineage: - self.unity_catalog_api_proxy.table_lineage(table) - lineage = self._generate_lineage_aspect(dataset_urn, table) + if self.config.include_notebooks: + for notebook_id in table.downstream_notebooks: + self.notebooks[str(notebook_id)] = Notebook.add_upstream( + table.ref, self.notebooks[str(notebook_id)] + ) yield from [ mcp.as_workunit() @@ -308,7 +395,29 @@ def process_table(self, table: Table, schema: Schema) -> Iterable[MetadataWorkUn ) ] - def _generate_column_lineage_aspect( + def ingest_lineage(self, table: Table) -> Optional[UpstreamLineageClass]: + if self.config.include_table_lineage: + self.unity_catalog_api_proxy.table_lineage( + table, include_entity_lineage=self.config.include_notebooks + ) + + if self.config.include_column_lineage and table.upstreams: + if len(table.columns) > self.config.column_lineage_column_limit: + self.report.num_column_lineage_skipped_column_count += 1 + + with ThreadPoolExecutor( + max_workers=self.config.lineage_max_workers + ) as executor: + for column in table.columns[: self.config.column_lineage_column_limit]: + executor.submit( + self.unity_catalog_api_proxy.get_column_lineage, + table, + column.name, + ) + + return self._generate_lineage_aspect(self.gen_dataset_urn(table.ref), table) + + def _generate_lineage_aspect( self, dataset_urn: str, table: Table ) -> Optional[UpstreamLineageClass]: upstreams: List[UpstreamClass] = [] @@ -318,6 +427,7 @@ def _generate_column_lineage_aspect( ): upstream_urn = self.gen_dataset_urn(upstream_ref) + # Should be empty if config.include_column_lineage is False finegrained_lineages.extend( FineGrainedLineage( upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, @@ -331,38 +441,50 @@ def _generate_column_lineage_aspect( for d_col, u_cols in sorted(downstream_to_upstream_cols.items()) ) - upstream_table = UpstreamClass( - upstream_urn, - DatasetLineageTypeClass.TRANSFORMED, - ) - upstreams.append(upstream_table) - - if upstreams: - return UpstreamLineageClass( - upstreams=upstreams, fineGrainedLineages=finegrained_lineages + upstreams.append( + UpstreamClass( + dataset=upstream_urn, + type=DatasetLineageTypeClass.TRANSFORMED, + ) ) - else: - return None - def _generate_lineage_aspect( - self, dataset_urn: str, table: Table - ) -> Optional[UpstreamLineageClass]: - upstreams: List[UpstreamClass] = [] - for upstream in sorted(table.upstreams.keys()): - upstream_urn = make_dataset_urn_with_platform_instance( - self.platform, - f"{table.schema.catalog.metastore.id}.{upstream}", - self.platform_instance_name, + for notebook in table.upstream_notebooks: + upstreams.append( + UpstreamClass( + dataset=self.gen_notebook_urn(notebook), + type=DatasetLineageTypeClass.TRANSFORMED, + ) ) - upstream_table = UpstreamClass( - upstream_urn, - DatasetLineageTypeClass.TRANSFORMED, - ) - upstreams.append(upstream_table) + if self.config.include_external_lineage: + for 
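# --- Editor's illustrative sketch (not part of the patch): the bounded per-column
# fan-out used by ingest_lineage above, standalone. Names are hypothetical.
from concurrent.futures import ThreadPoolExecutor

def fetch_column_lineage(table: str, column: str) -> None:
    """Stand-in for the per-column REST call."""

columns = [f"col_{i}" for i in range(500)]
limit = 300  # mirrors column_lineage_column_limit
with ThreadPoolExecutor(max_workers=8) as executor:
    for column in columns[:limit]:
        executor.submit(fetch_column_lineage, "main.raw.events", column)
# Leaving the `with` block waits for all submitted calls, which is why the
# lineage aspect is only assembled after the executor shuts down.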
external_ref in table.external_upstreams: + if not external_ref.has_permission or not external_ref.path: + self.report.num_external_upstreams_lacking_permissions += 1 + logger.warning( + f"Lacking permissions for external file upstream on {table.ref}" + ) + elif external_ref.path.startswith("s3://"): + upstreams.append( + UpstreamClass( + dataset=make_s3_urn_for_lineage( + external_ref.path, self.config.env + ), + type=DatasetLineageTypeClass.COPY, + ) + ) + else: + self.report.num_external_upstreams_unsupported += 1 + logger.warning( + f"Unsupported external file upstream on {table.ref}: {external_ref.path}" + ) if upstreams: - return UpstreamLineageClass(upstreams=upstreams) + return UpstreamLineageClass( + upstreams=upstreams, + fineGrainedLineages=finegrained_lineages + if self.config.include_column_lineage + else None, + ) else: return None @@ -389,6 +511,14 @@ def gen_dataset_urn(self, table_ref: TableReference) -> str: name=str(table_ref), ) + def gen_notebook_urn(self, notebook: Union[Notebook, NotebookId]) -> str: + notebook_id = notebook.id if isinstance(notebook, Notebook) else notebook + return NotebookKey( + notebook_id=notebook_id, + platform=self.platform, + instance=self.config.platform_instance, + ).as_urn() + def gen_schema_containers(self, schema: Schema) -> Iterable[MetadataWorkUnit]: domain_urn = self._gen_domain_urn(f"{schema.catalog.name}.{schema.name}") @@ -423,27 +553,37 @@ def gen_metastore_containers( def gen_catalog_containers(self, catalog: Catalog) -> Iterable[MetadataWorkUnit]: domain_urn = self._gen_domain_urn(catalog.name) - metastore_container_key = self.gen_metastore_key(catalog.metastore) catalog_container_key = self.gen_catalog_key(catalog) yield from gen_containers( container_key=catalog_container_key, name=catalog.name, sub_types=[DatasetContainerSubTypes.CATALOG], domain_urn=domain_urn, - parent_container_key=metastore_container_key, + parent_container_key=self.gen_metastore_key(catalog.metastore) + if self.config.include_metastore and catalog.metastore + else None, description=catalog.comment, owner_urn=self.get_owner_urn(catalog.owner), external_url=f"{self.external_url_base}/{catalog.name}", ) def gen_schema_key(self, schema: Schema) -> ContainerKey: - return UnitySchemaKey( - unity_schema=schema.name, - platform=self.platform, - instance=self.config.platform_instance, - catalog=schema.catalog.name, - metastore=schema.catalog.metastore.name, - ) + if self.config.include_metastore: + assert schema.catalog.metastore + return UnitySchemaKeyWithMetastore( + unity_schema=schema.name, + platform=self.platform, + instance=self.config.platform_instance, + catalog=schema.catalog.name, + metastore=schema.catalog.metastore.name, + ) + else: + return UnitySchemaKey( + unity_schema=schema.name, + platform=self.platform, + instance=self.config.platform_instance, + catalog=schema.catalog.name, + ) def gen_metastore_key(self, metastore: Metastore) -> MetastoreKey: return MetastoreKey( @@ -452,13 +592,21 @@ def gen_metastore_key(self, metastore: Metastore) -> MetastoreKey: instance=self.config.platform_instance, ) - def gen_catalog_key(self, catalog: Catalog) -> CatalogKey: - return CatalogKey( - catalog=catalog.name, - metastore=catalog.metastore.name, - platform=self.platform, - instance=self.config.platform_instance, - ) + def gen_catalog_key(self, catalog: Catalog) -> ContainerKey: + if self.config.include_metastore: + assert catalog.metastore + return CatalogKeyWithMetastore( + catalog=catalog.name, + metastore=catalog.metastore.name, + 
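# --- Editor's note (not part of the patch), as a hypothetical sketch: the split
# into CatalogKey vs CatalogKeyWithMetastore above exists because every field of
# a container key feeds its guid, and the guid feeds the container urn.
inputs_with_metastore = {"platform": "databricks", "metastore": "primary", "catalog": "main"}
inputs_without_metastore = {"platform": "databricks", "catalog": "main"}
# Different guid inputs => different container urns, which is why toggling
# include_metastore changes every urn in the workspace (see the config warning).
assert inputs_with_metastore != inputs_without_metastore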
platform=self.platform, + instance=self.config.platform_instance, + ) + else: + return CatalogKey( + catalog=catalog.name, + platform=self.platform, + instance=self.config.platform_instance, + ) def _gen_domain_urn(self, dataset_name: str) -> Optional[str]: domain_urn: Optional[str] = None @@ -563,15 +711,16 @@ def _create_table_ownership_aspect(self, table: Table) -> Optional[OwnershipClas return None def _create_data_platform_instance_aspect( - self, table: Table + self, ) -> Optional[DataPlatformInstanceClass]: - # Only ingest the DPI aspect if the flag is true if self.config.ingest_data_platform_instance_aspect: return DataPlatformInstanceClass( platform=make_data_platform_urn(self.platform), instance=make_dataplatform_instance_urn( self.platform, self.platform_instance_name - ), + ) + if self.platform_instance_name + else None, ) return None diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index 49f56b46fb0121..ab21c1a3186596 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -214,12 +214,15 @@ def _resolve_tables( self, tables: List[str], table_map: TableMap ) -> List[TableReference]: """Resolve tables to TableReferences, filtering out unrecognized or unresolvable table names.""" + + missing_table = False + duplicate_table = False output = [] for table in tables: table = str(table) if table not in table_map: logger.debug(f"Dropping query with unrecognized table: {table}") - self.report.num_queries_dropped_missing_table += 1 + missing_table = True else: refs = table_map[table] if len(refs) == 1: @@ -228,6 +231,11 @@ def _resolve_tables( logger.warning( f"Could not resolve table ref for {table}: {len(refs)} duplicates." ) - self.report.num_queries_dropped_duplicate_table += 1 + duplicate_table = True + + if missing_table: + self.report.num_queries_missing_table += 1 + if duplicate_table: + self.report.num_queries_duplicate_table += 1 return output diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source_config/bigquery.py index 8ca1296d819c1c..0a73bb5203e72b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/bigquery.py @@ -4,7 +4,13 @@ from datahub.configuration.common import ConfigModel, ConfigurationError -_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: str = "((.+)[_$])?(\\d{8})$" +# Regexp for sharded tables. +# A sharded table is a table that has a suffix of the form _yyyymmdd or yyyymmdd, where yyyymmdd is a date. +# The regexp checks for valid dates in the suffix (e.g. 20200101, 20200229, 20201231) and if the date is not valid +# then it is not a sharded table. 
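# --- Editor's illustrative check (not part of the patch): the tightened pattern
# defined just below only accepts suffixes that look like real dates.
# Self-contained demo with the same regex copied inline.
import re

_SHARDED = "((.+\\D)[_$]?)?(\\d\\d\\d\\d(?:0[1-9]|1[0-2])(?:0[1-9]|[12][0-9]|3[01]))$"

assert re.search(_SHARDED, "events_20200229").group(3) == "20200229"  # valid date suffix
assert re.search(_SHARDED, "events_20201340") is None  # month 13 / day 40: not a shard
assert re.search(_SHARDED, "20220101").group(3) == "20220101"  # bare date-named shard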
+_BIGQUERY_DEFAULT_SHARDED_TABLE_REGEX: str = ( + "((.+\\D)[_$]?)?(\\d\\d\\d\\d(?:0[1-9]|1[0-2])(?:0[1-9]|[12][0-9]|3[01]))$" +) class BigQueryBaseConfig(ConfigModel): diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 0d72fc52da0cab..c3e8c175f1de54 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -166,13 +166,17 @@ def _check_oauth_config(oauth_config: Optional[OAuthConfiguration]) -> None: "but should be set when using use_certificate false for oauth_config" ) - @pydantic.validator("include_view_lineage") - def validate_include_view_lineage(cls, v, values): - if not values.get("include_table_lineage") and v: + @pydantic.root_validator() + def validate_include_view_lineage(cls, values): + if ( + "include_table_lineage" in values + and not values.get("include_table_lineage") + and values.get("include_view_lineage") + ): raise ValueError( "include_table_lineage must be True for include_view_lineage to be set." ) - return v + return values def get_sql_alchemy_url( self, diff --git a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py index eabf62a4cda2b8..8b393a8f6f1c68 100644 --- a/metadata-ingestion/src/datahub/integrations/great_expectations/action.py +++ b/metadata-ingestion/src/datahub/integrations/great_expectations/action.py @@ -35,7 +35,10 @@ from datahub.cli.cli_utils import get_boolean_env_variable from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.ingestion.source.sql.sql_common import get_platform_from_sqlalchemy_uri +from datahub.emitter.serialization_helper import pre_json_transform +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( + get_platform_from_sqlalchemy_uri, +) from datahub.metadata.com.linkedin.pegasus2avro.assertion import ( AssertionInfo, AssertionResult, @@ -251,13 +254,15 @@ def get_assertions_with_results( # possibly for each validation run assertionUrn = builder.make_assertion_urn( builder.datahub_guid( - { - "platform": GE_PLATFORM_NAME, - "nativeType": expectation_type, - "nativeParameters": kwargs, - "dataset": assertion_datasets[0], - "fields": assertion_fields, - } + pre_json_transform( + { + "platform": GE_PLATFORM_NAME, + "nativeType": expectation_type, + "nativeParameters": kwargs, + "dataset": assertion_datasets[0], + "fields": assertion_fields, + } + ) ) ) logger.debug( @@ -636,7 +641,7 @@ def get_dataset_partitions(self, batch_identifier, data_asset): ].batch_request.runtime_parameters["query"] partitionSpec = PartitionSpecClass( type=PartitionTypeClass.QUERY, - partition=f"Query_{builder.datahub_guid(query)}", + partition=f"Query_{builder.datahub_guid(pre_json_transform(query))}", ) batchSpec = BatchSpec( diff --git a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py index 8516a7054a9cdc..2b610947e9043c 100644 --- a/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py +++ b/metadata-ingestion/src/datahub/testing/check_sql_parser_result.py @@ -24,6 +24,7 @@ def assert_sql_result_with_resolver( *, expected_file: pathlib.Path, schema_resolver: SchemaResolver, + allow_table_error: bool = False, **kwargs: Any, ) -> None: # HACK: Our 
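# --- Editor's illustrative sketch (not part of the patch): why the Snowflake check
# above moved from a field validator to a root_validator. In pydantic v1, a field
# validator's `values` only contains fields declared before it (and omits fields
# that failed validation), while a root_validator sees the whole dict. Minimal
# standalone model with hypothetical defaults.
import pydantic  # pydantic v1 API assumed, matching this codebase

class _LineageFlags(pydantic.BaseModel):
    include_table_lineage: bool = True
    include_view_lineage: bool = True

    @pydantic.root_validator()
    def _view_requires_table(cls, values):
        if (
            "include_table_lineage" in values
            and not values.get("include_table_lineage")
            and values.get("include_view_lineage")
        ):
            raise ValueError("include_table_lineage must be True for include_view_lineage to be set.")
        return values

_LineageFlags(include_table_lineage=True, include_view_lineage=True)  # validates fine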
BigQuery source overwrites this value and doesn't undo it. @@ -36,6 +37,14 @@ def assert_sql_result_with_resolver( **kwargs, ) + if res.debug_info.table_error: + if allow_table_error: + logger.info( + f"SQL parser table error: {res.debug_info.table_error}", + exc_info=res.debug_info.table_error, + ) + else: + raise res.debug_info.table_error if res.debug_info.column_error: logger.warning( f"SQL parser column error: {res.debug_info.column_error}", @@ -70,11 +79,14 @@ def assert_sql_result( sql: str, *, dialect: str, + platform_instance: Optional[str] = None, expected_file: pathlib.Path, schemas: Optional[Dict[str, SchemaInfo]] = None, **kwargs: Any, ) -> None: - schema_resolver = SchemaResolver(platform=dialect) + schema_resolver = SchemaResolver( + platform=dialect, platform_instance=platform_instance + ) if schemas: for urn, schema in schemas.items(): schema_resolver.add_raw_schema_info(urn, schema) diff --git a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py index 5c52e1ab4f0b37..54f6a6e984c00d 100644 --- a/metadata-ingestion/src/datahub/testing/compare_metadata_json.py +++ b/metadata-ingestion/src/datahub/testing/compare_metadata_json.py @@ -40,6 +40,7 @@ def assert_metadata_files_equal( update_golden: bool, copy_output: bool, ignore_paths: Sequence[str] = (), + ignore_order: bool = True, ) -> None: golden_exists = os.path.isfile(golden_path) @@ -65,7 +66,7 @@ def assert_metadata_files_equal( write_metadata_file(pathlib.Path(temp.name), golden_metadata) golden = load_json_file(temp.name) - diff = diff_metadata_json(output, golden, ignore_paths) + diff = diff_metadata_json(output, golden, ignore_paths, ignore_order=ignore_order) if diff and update_golden: if isinstance(diff, MCPDiff): diff.apply_delta(golden) @@ -91,16 +92,19 @@ def diff_metadata_json( output: MetadataJson, golden: MetadataJson, ignore_paths: Sequence[str] = (), + ignore_order: bool = True, ) -> Union[DeepDiff, MCPDiff]: ignore_paths = (*ignore_paths, *default_exclude_paths, r"root\[\d+].delta_info") try: - golden_map = get_aspects_by_urn(golden) - output_map = get_aspects_by_urn(output) - return MCPDiff.create( - golden=golden_map, - output=output_map, - ignore_paths=ignore_paths, - ) + if ignore_order: + golden_map = get_aspects_by_urn(golden) + output_map = get_aspects_by_urn(output) + return MCPDiff.create( + golden=golden_map, + output=output_map, + ignore_paths=ignore_paths, + ) + # if ignore_order is False, always use DeepDiff except CannotCompareMCPs as e: logger.info(f"{e}, falling back to MCE diff") except AssertionError as e: @@ -111,5 +115,5 @@ def diff_metadata_json( golden, output, exclude_regex_paths=ignore_paths, - ignore_order=True, + ignore_order=ignore_order, ) diff --git a/metadata-ingestion/src/datahub/upgrade/upgrade.py b/metadata-ingestion/src/datahub/upgrade/upgrade.py index 30f19b8b84f354..acc7954ad25a63 100644 --- a/metadata-ingestion/src/datahub/upgrade/upgrade.py +++ b/metadata-ingestion/src/datahub/upgrade/upgrade.py @@ -1,6 +1,5 @@ import asyncio import contextlib -import functools import logging import sys from datetime import datetime, timedelta, timezone @@ -374,17 +373,14 @@ def check_upgrade(func: Callable[..., T]) -> Callable[..., T]: @wraps(func) def async_wrapper(*args: Any, **kwargs: Any) -> Any: async def run_inner_func(): - loop = asyncio.get_event_loop() - return await loop.run_in_executor( - None, functools.partial(func, *args, **kwargs) - ) + return func(*args, **kwargs) async def 
run_func_check_upgrade(): version_stats_future = asyncio.ensure_future(retrieve_version_stats()) - the_one_future = asyncio.ensure_future(run_inner_func()) - ret = await the_one_future + main_func_future = asyncio.ensure_future(run_inner_func()) + ret = await main_func_future - # the one future has returned + # the main future has returned # we check the other futures quickly try: version_stats = await asyncio.wait_for(version_stats_future, 0.5) diff --git a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py index c04d2138bc1161..18493edded4b7d 100644 --- a/metadata-ingestion/src/datahub/utilities/file_backed_collections.py +++ b/metadata-ingestion/src/datahub/utilities/file_backed_collections.py @@ -3,6 +3,7 @@ import logging import pathlib import pickle +import shutil import sqlite3 import tempfile from dataclasses import dataclass, field @@ -56,15 +57,15 @@ class ConnectionWrapper: conn: sqlite3.Connection filename: pathlib.Path - _temp_directory: Optional[tempfile.TemporaryDirectory] + _temp_directory: Optional[str] def __init__(self, filename: Optional[pathlib.Path] = None): self._temp_directory = None # Warning: If filename is provided, the file will not be automatically cleaned up. if not filename: - self._temp_directory = tempfile.TemporaryDirectory() - filename = pathlib.Path(self._temp_directory.name) / _DEFAULT_FILE_NAME + self._temp_directory = tempfile.mkdtemp() + filename = pathlib.Path(self._temp_directory) / _DEFAULT_FILE_NAME self.conn = sqlite3.connect(filename, isolation_level=None) self.conn.row_factory = sqlite3.Row @@ -101,7 +102,8 @@ def executemany( def close(self) -> None: self.conn.close() if self._temp_directory: - self._temp_directory.cleanup() + shutil.rmtree(self._temp_directory) + self._temp_directory = None def __enter__(self) -> "ConnectionWrapper": return self diff --git a/metadata-ingestion/src/datahub/utilities/mapping.py b/metadata-ingestion/src/datahub/utilities/mapping.py index 793eccfb22c7e8..f91c01d901ac1e 100644 --- a/metadata-ingestion/src/datahub/utilities/mapping.py +++ b/metadata-ingestion/src/datahub/utilities/mapping.py @@ -2,12 +2,16 @@ import logging import operator import re +import time from functools import reduce -from typing import Any, Dict, List, Match, Optional, Union +from typing import Any, Dict, List, Mapping, Match, Optional, Union, cast from datahub.emitter import mce_builder from datahub.emitter.mce_builder import OwnerType from datahub.metadata.schema_classes import ( + AuditStampClass, + InstitutionalMemoryClass, + InstitutionalMemoryMetadataClass, OwnerClass, OwnershipClass, OwnershipSourceClass, @@ -39,6 +43,7 @@ def _insert_match_value(original_value: str, match_value: str) -> str: class Constants: + ADD_DOC_LINK_OPERATION = "add_doc_link" ADD_TAG_OPERATION = "add_tag" ADD_TERM_OPERATION = "add_term" ADD_TERMS_OPERATION = "add_terms" @@ -47,6 +52,8 @@ class Constants: OPERATION_CONFIG = "config" TAG = "tag" TERM = "term" + DOC_LINK = "link" + DOC_DESCRIPTION = "description" OWNER_TYPE = "owner_type" OWNER_CATEGORY = "owner_category" MATCH = "match" @@ -104,7 +111,7 @@ def __init__( self.owner_source_type = owner_source_type self.match_nested_props = match_nested_props - def process(self, raw_props: Dict[str, Any]) -> Dict[str, Any]: + def process(self, raw_props: Mapping[str, Any]) -> Dict[str, Any]: # Defining the following local variables - # operations_map - the final resulting map when operations are processed. 
# Against each operation the values to be applied are stored. @@ -163,7 +170,6 @@ def process(self, raw_props: Dict[str, Any]) -> Dict[str, Any]: ) operations_value_list.append(operation) # type: ignore operations_map[operation_type] = operations_value_list - aspect_map = self.convert_to_aspects(operations_map) except Exception as e: logger.error(f"Error while processing operation defs over raw_props: {e}") @@ -173,6 +179,7 @@ def convert_to_aspects( self, operation_map: Dict[str, Union[set, list]] ) -> Dict[str, Any]: aspect_map: Dict[str, Any] = {} + if Constants.ADD_TAG_OPERATION in operation_map: tag_aspect = mce_builder.make_global_tag_aspect_with_tag_list( sorted(operation_map[Constants.ADD_TAG_OPERATION]) @@ -195,11 +202,57 @@ def convert_to_aspects( ] ) aspect_map[Constants.ADD_OWNER_OPERATION] = owner_aspect + if Constants.ADD_TERM_OPERATION in operation_map: term_aspect = mce_builder.make_glossary_terms_aspect_from_urn_list( sorted(operation_map[Constants.ADD_TERM_OPERATION]) ) aspect_map[Constants.ADD_TERM_OPERATION] = term_aspect + + if Constants.ADD_DOC_LINK_OPERATION in operation_map: + try: + if len( + operation_map[Constants.ADD_DOC_LINK_OPERATION] + ) == 1 and isinstance( + operation_map[Constants.ADD_DOC_LINK_OPERATION], list + ): + docs_dict = cast( + List[Dict], operation_map[Constants.ADD_DOC_LINK_OPERATION] + )[0] + if "description" not in docs_dict or "link" not in docs_dict: + raise Exception( + "Documentation_link meta_mapping config needs a description key and a link key" + ) + + now = int(time.time() * 1000) # milliseconds since epoch + institutional_memory_element = InstitutionalMemoryMetadataClass( + url=docs_dict["link"], + description=docs_dict["description"], + createStamp=AuditStampClass( + time=now, actor="urn:li:corpuser:ingestion" + ), + ) + + # create a new institutional memory aspect + institutional_memory_aspect = InstitutionalMemoryClass( + elements=[institutional_memory_element] + ) + + aspect_map[ + Constants.ADD_DOC_LINK_OPERATION + ] = institutional_memory_aspect + else: + raise Exception( + f"Expected 1 item of type list for the documentation_link meta_mapping config," + f" received type of {type(operation_map[Constants.ADD_DOC_LINK_OPERATION])}" + f", and size of {len(operation_map[Constants.ADD_DOC_LINK_OPERATION])}." 
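# --- Editor's illustrative sketch (not part of the patch): what an add_doc_link
# entry in a meta_mapping config might look like, using the key names from the
# Constants above. The match pattern and URL are hypothetical; matched text can
# also be substituted into the link via _insert_match_value.
doc_link_mapping = {
    "documentation": {
        "match": ".*",
        "operation": "add_doc_link",
        "config": {
            "link": "https://wiki.example.com/datasets",
            "description": "Team documentation",
        },
    }
}
# get_operation_value returns {"link": ..., "description": ...} for this operation,
# and convert_to_aspects wraps exactly one such entry into an
# InstitutionalMemoryClass aspect (hence the length-1 list check above).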
+ ) + + except Exception as e: + logger.error( + f"Error while constructing aspect for documentation link and description : {e}" + ) + return aspect_map def get_operation_value( @@ -248,6 +301,16 @@ def get_operation_value( term = operation_config[Constants.TERM] term = _insert_match_value(term, _get_best_match(match, "term")) return mce_builder.make_term_urn(term) + elif ( + operation_type == Constants.ADD_DOC_LINK_OPERATION + and operation_config[Constants.DOC_LINK] + and operation_config[Constants.DOC_DESCRIPTION] + ): + link = operation_config[Constants.DOC_LINK] + link = _insert_match_value(link, _get_best_match(match, "link")) + description = operation_config[Constants.DOC_DESCRIPTION] + return {"link": link, "description": description} + elif operation_type == Constants.ADD_TERMS_OPERATION: separator = operation_config.get(Constants.SEPARATOR, ",") captured_terms = match.group(0) diff --git a/metadata-ingestion/src/datahub/utilities/ratelimiter.py b/metadata-ingestion/src/datahub/utilities/ratelimiter.py new file mode 100644 index 00000000000000..3d47d25e14c492 --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/ratelimiter.py @@ -0,0 +1,56 @@ +import collections +import threading +import time +from contextlib import AbstractContextManager +from typing import Any, Deque + + +# Modified version of https://github.com/RazerM/ratelimiter/blob/master/ratelimiter/_sync.py +class RateLimiter(AbstractContextManager): + + """Provides rate limiting for an operation with a configurable number of + requests for a time period. + """ + + def __init__(self, max_calls: int, period: float = 1.0) -> None: + """Initialize a RateLimiter object which enforces as much as max_calls + operations on period (eventually floating) number of seconds. + """ + if period <= 0: + raise ValueError("Rate limiting period should be > 0") + if max_calls <= 0: + raise ValueError("Rate limiting number of calls should be > 0") + + # We're using a deque to store the last execution timestamps, not for + # its maxlen attribute, but to allow constant time front removal. + self.calls: Deque = collections.deque() + + self.period = period + self.max_calls = max_calls + self._lock = threading.Lock() + + def __enter__(self) -> "RateLimiter": + with self._lock: + # We want to ensure that no more than max_calls were run in the allowed + # period. For this, we store the last timestamps of each call and run + # the rate verification upon each __enter__ call. + if len(self.calls) >= self.max_calls: + until = time.time() + self.period - self._timespan + sleeptime = until - time.time() + if sleeptime > 0: + time.sleep(sleeptime) + return self + + def __exit__(self, exc_type: Any, exc: Any, traceback: Any) -> None: + with self._lock: + # Store the last operation timestamp. + self.calls.append(time.time()) + + # Pop the timestamp list front (ie: the older calls) until the sum goes + # back below the period. This is our 'sliding period' window. 
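# Editor's usage note (not part of the patch): callers throttle by entering
# the limiter as a context manager, e.g.
#
#     limiter = RateLimiter(max_calls=2, period=1.0)
#     for url in urls:
#         with limiter:
#             fetch(url)  # hypothetical call being rate limited
#
# Worked example: with calls recorded at t=0.0s and t=0.1s, a third __enter__
# finds len(calls) >= max_calls and sleeps period - _timespan = 0.9s, so it
# proceeds roughly one period after the first recorded call.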
+ while self._timespan >= self.period: + self.calls.popleft() + + @property + def _timespan(self) -> float: + return self.calls[-1] - self.calls[0] diff --git a/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py new file mode 100644 index 00000000000000..5d2fc6872c7bd9 --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/sqlalchemy_type_converter.py @@ -0,0 +1,211 @@ +import json +import logging +import uuid +from typing import Any, Dict, List, Optional, Type, Union + +from sqlalchemy import types + +from datahub.ingestion.extractor.schema_util import avro_schema_to_mce_fields +from datahub.metadata.com.linkedin.pegasus2avro.schema import SchemaField +from datahub.metadata.schema_classes import NullTypeClass, SchemaFieldDataTypeClass + +logger = logging.getLogger(__name__) + +try: + # This is used for both BigQuery and Athena. + from sqlalchemy_bigquery import STRUCT +except ImportError: + STRUCT = None + + +class MapType(types.TupleType): + # Wrapper class around SQLalchemy's TupleType to increase compatibility with DataHub + pass + + +class SqlAlchemyColumnToAvroConverter: + """Helper class that collects some methods to convert SQLalchemy columns to Avro schema.""" + + # tuple of complex data types that require a special handling + _COMPLEX_TYPES = (STRUCT, types.ARRAY, MapType) + + # mapping of primitive SQLalchemy data types to AVRO schema data types + PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE: Dict[Type[types.TypeEngine], str] = { + types.String: "string", + types.BINARY: "string", + types.BOOLEAN: "boolean", + types.FLOAT: "float", + types.INTEGER: "int", + types.BIGINT: "long", + types.VARCHAR: "string", + types.CHAR: "string", + } + + @classmethod + def get_avro_type( + cls, column_type: Union[types.TypeEngine, STRUCT, MapType], nullable: bool + ) -> Dict[str, Any]: + """Determines the concrete AVRO schema type for a SQLalchemy-typed column""" + + if isinstance( + column_type, tuple(cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE.keys()) + ): + return { + "type": cls.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE[type(column_type)], + "native_data_type": str(column_type), + "_nullable": nullable, + } + if isinstance(column_type, types.DECIMAL): + return { + "type": "bytes", + "logicalType": "decimal", + "precision": int(column_type.precision), + "scale": int(column_type.scale), + "native_data_type": str(column_type), + "_nullable": nullable, + } + if isinstance(column_type, types.DATE): + return { + "type": "int", + "logicalType": "date", + "native_data_type": str(column_type), + "_nullable": nullable, + } + if isinstance(column_type, types.TIMESTAMP): + return { + "type": "long", + "logicalType": "timestamp-millis", + "native_data_type": str(column_type), + "_nullable": nullable, + } + if isinstance(column_type, types.ARRAY): + array_type = column_type.item_type + return { + "type": "array", + "items": cls.get_avro_type(column_type=array_type, nullable=nullable), + "native_data_type": f"array<{str(column_type.item_type)}>", + } + if isinstance(column_type, MapType): + key_type = column_type.types[0] + value_type = column_type.types[1] + return { + "type": "map", + "values": cls.get_avro_type(column_type=value_type, nullable=nullable), + "native_data_type": str(column_type), + "key_type": cls.get_avro_type(column_type=key_type, nullable=nullable), + "key_native_data_type": str(key_type), + } + if STRUCT and isinstance(column_type, STRUCT): + fields = [] + for field_def in 
column_type._STRUCT_fields: + field_name, field_type = field_def + fields.append( + { + "name": field_name, + "type": cls.get_avro_type( + column_type=field_type, nullable=nullable + ), + } + ) + struct_name = f"__struct_{str(uuid.uuid4()).replace('-', '')}" + + return { + "type": "record", + "name": struct_name, + "fields": fields, + "native_data_type": str(column_type), + "_nullable": nullable, + } + + return { + "type": "null", + "native_data_type": str(column_type), + "_nullable": nullable, + } + + @classmethod + def get_avro_for_sqlalchemy_column( + cls, + column_name: str, + column_type: types.TypeEngine, + nullable: bool, + ) -> Union[object, Dict[str, object]]: + """Returns the AVRO schema representation of a SQLalchemy column.""" + if isinstance(column_type, cls._COMPLEX_TYPES): + return { + "type": "record", + "name": "__struct_", + "fields": [ + { + "name": column_name, + "type": cls.get_avro_type( + column_type=column_type, nullable=nullable + ), + } + ], + } + return cls.get_avro_type(column_type=column_type, nullable=nullable) + + +def get_schema_fields_for_sqlalchemy_column( + column_name: str, + column_type: types.TypeEngine, + description: Optional[str] = None, + nullable: Optional[bool] = True, + is_part_of_key: Optional[bool] = False, +) -> List[SchemaField]: + """Creates SchemaFields from a given SQLalchemy column. + + This function is analogous to `get_schema_fields_for_hive_column` from datahub.utilities.hive_schema_to_avro. + The main purpose of implementing it this way, is to make it ready/compatible for second field path generation, + which allows to explore nested structures within the UI. + """ + + if nullable is None: + nullable = True + + try: + # as a first step, the column is converted to AVRO JSON which can then be used by an existing function + avro_schema_json = ( + SqlAlchemyColumnToAvroConverter.get_avro_for_sqlalchemy_column( + column_name=column_name, + column_type=column_type, + nullable=nullable, + ) + ) + # retrieve schema field definitions from the above generated AVRO JSON structure + schema_fields = avro_schema_to_mce_fields( + avro_schema=json.dumps(avro_schema_json), + default_nullable=nullable, + swallow_exceptions=False, + ) + except Exception as e: + logger.warning( + f"Unable to parse column {column_name} and type {column_type} the error was: {e}" + ) + + # fallback description in case any exception occurred + schema_fields = [ + SchemaField( + fieldPath=column_name, + type=SchemaFieldDataTypeClass(type=NullTypeClass()), + nativeDataType=str(column_type), + ) + ] + + # for all non-nested data types an additional modification of the `fieldPath` property is required + if type(column_type) in ( + *SqlAlchemyColumnToAvroConverter.PRIMITIVE_SQL_ALCHEMY_TYPE_TO_AVRO_TYPE.keys(), + types.TIMESTAMP, + types.DATE, + types.DECIMAL, + ): + schema_fields[0].fieldPath += f".{column_name}" + + if description: + schema_fields[0].description = description + schema_fields[0].isPartOfKey = ( + is_part_of_key if is_part_of_key is not None else False + ) + + return schema_fields diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index f18235af3d1fd5..6413275ac63a6f 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -5,14 +5,15 @@ import logging import pathlib from collections import defaultdict -from typing import Dict, List, Optional, Set, Tuple, Union +from typing import Any, Dict, 
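# --- Editor's illustrative sketch (not part of the patch): driving the converter
# defined above. The import path matches this diff; the exact fieldPath rendering
# is an assumption, since it depends on avro_schema_to_mce_fields.
from sqlalchemy import types

from datahub.utilities.sqlalchemy_type_converter import (
    get_schema_fields_for_sqlalchemy_column,
)

fields = get_schema_fields_for_sqlalchemy_column(
    column_name="scores",
    column_type=types.ARRAY(types.INTEGER()),
    description="per-user scores",
    nullable=True,
)
# Expect v2 field paths that encode the nesting, along the lines of
# "[version=2.0].[type=struct].[type=array].scores", enabling nested-structure
# exploration in the UI as described in the docstring above.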
List, Optional, Set, Tuple, Union import pydantic.dataclasses import sqlglot import sqlglot.errors import sqlglot.lineage +import sqlglot.optimizer.annotate_types +import sqlglot.optimizer.optimizer import sqlglot.optimizer.qualify -import sqlglot.optimizer.qualify_columns from pydantic import BaseModel from typing_extensions import TypedDict @@ -23,7 +24,17 @@ from datahub.ingestion.api.closeable import Closeable from datahub.ingestion.graph.client import DataHubGraph from datahub.ingestion.source.bigquery_v2.bigquery_audit import BigqueryTableIdentifier -from datahub.metadata.schema_classes import OperationTypeClass, SchemaMetadataClass +from datahub.metadata.schema_classes import ( + ArrayTypeClass, + BooleanTypeClass, + DateTypeClass, + NumberTypeClass, + OperationTypeClass, + SchemaFieldDataTypeClass, + SchemaMetadataClass, + StringTypeClass, + TimeTypeClass, +) from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict from datahub.utilities.urns.dataset_urn import DatasetUrn @@ -37,6 +48,19 @@ SQL_PARSE_RESULT_CACHE_SIZE = 1000 +RULES_BEFORE_TYPE_ANNOTATION: tuple = tuple( + filter( + # Skip pushdown_predicates because it sometimes throws exceptions, and we + # don't actually need it for anything. + lambda func: func.__name__ not in {"pushdown_predicates"}, + itertools.takewhile( + lambda func: func != sqlglot.optimizer.annotate_types.annotate_types, + sqlglot.optimizer.optimizer.RULES, + ), + ) +) + + class GraphQLSchemaField(TypedDict): fieldPath: str nativeDataType: str @@ -82,6 +106,7 @@ def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType: sqlglot.exp.Update: QueryType.UPDATE, sqlglot.exp.Delete: QueryType.DELETE, sqlglot.exp.Merge: QueryType.MERGE, + sqlglot.exp.Subqueryable: QueryType.SELECT, # unions, etc. 
are also selects } for cls, query_type in mapping.items(): @@ -90,8 +115,18 @@ def get_query_type_of_sql(expression: sqlglot.exp.Expression) -> QueryType: return QueryType.UNKNOWN +class _ParserBaseModel( + BaseModel, + arbitrary_types_allowed=True, + json_encoders={ + SchemaFieldDataTypeClass: lambda v: v.to_obj(), + }, +): + pass + + @functools.total_ordering -class _FrozenModel(BaseModel, frozen=True): +class _FrozenModel(_ParserBaseModel, frozen=True): def __lt__(self, other: "_FrozenModel") -> bool: for field in self.__fields__: self_v = getattr(self, field) @@ -146,29 +181,42 @@ class _ColumnRef(_FrozenModel): column: str -class ColumnRef(BaseModel): +class ColumnRef(_ParserBaseModel): table: Urn column: str -class _DownstreamColumnRef(BaseModel): +class _DownstreamColumnRef(_ParserBaseModel): table: Optional[_TableName] column: str + column_type: Optional[sqlglot.exp.DataType] -class DownstreamColumnRef(BaseModel): +class DownstreamColumnRef(_ParserBaseModel): table: Optional[Urn] column: str + column_type: Optional[SchemaFieldDataTypeClass] + native_column_type: Optional[str] + + @pydantic.validator("column_type", pre=True) + def _load_column_type( + cls, v: Optional[Union[dict, SchemaFieldDataTypeClass]] + ) -> Optional[SchemaFieldDataTypeClass]: + if v is None: + return None + if isinstance(v, SchemaFieldDataTypeClass): + return v + return SchemaFieldDataTypeClass.from_obj(v) -class _ColumnLineageInfo(BaseModel): +class _ColumnLineageInfo(_ParserBaseModel): downstream: _DownstreamColumnRef upstreams: List[_ColumnRef] logic: Optional[str] -class ColumnLineageInfo(BaseModel): +class ColumnLineageInfo(_ParserBaseModel): downstream: DownstreamColumnRef upstreams: List[ColumnRef] @@ -176,7 +224,7 @@ class ColumnLineageInfo(BaseModel): logic: Optional[str] = pydantic.Field(default=None, exclude=True) -class SqlParsingDebugInfo(BaseModel, arbitrary_types_allowed=True): +class SqlParsingDebugInfo(_ParserBaseModel): confidence: float = 0.0 tables_discovered: int = 0 @@ -190,7 +238,7 @@ def error(self) -> Optional[Exception]: return self.table_error or self.column_error -class SqlParsingResult(BaseModel): +class SqlParsingResult(_ParserBaseModel): query_type: QueryType = QueryType.UNKNOWN in_tables: List[Urn] @@ -207,9 +255,9 @@ class SqlParsingResult(BaseModel): ) -def _parse_statement(sql: str, dialect: str) -> sqlglot.Expression: - statement = sqlglot.parse_one( - sql, read=dialect, error_level=sqlglot.ErrorLevel.RAISE +def _parse_statement(sql: sqlglot.exp.ExpOrStr, dialect: str) -> sqlglot.Expression: + statement: sqlglot.Expression = sqlglot.maybe_parse( + sql, dialect=dialect, error_level=sqlglot.ErrorLevel.RAISE ) return statement @@ -231,6 +279,13 @@ def _table_level_lineage( # In some cases like "MERGE ... then INSERT (col1, col2) VALUES (col1, col2)", # the `this` on the INSERT part isn't a table. if isinstance(expr.this, sqlglot.exp.Table) + } | { + # For CREATE DDL statements, the table name is nested inside + # a Schema object. 
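+        # For example, `CREATE TABLE db.t (a INT)` parses as
+        # Create(this=Schema(this=Table(db.t), expressions=[ColumnDef(a)])),
+        # so the Table node sits one level deeper than it does for INSERTs.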
+ _TableName.from_sqlglot_table(expr.this.this) + for expr in statement.find_all(sqlglot.exp.Create) + if isinstance(expr.this, sqlglot.exp.Schema) + and isinstance(expr.this.this, sqlglot.exp.Table) } tables = ( @@ -242,12 +297,16 @@ def _table_level_lineage( - modified # ignore CTEs created in this statement - { - _TableName(database=None, schema=None, table=cte.alias_or_name) + _TableName(database=None, db_schema=None, table=cte.alias_or_name) for cte in statement.find_all(sqlglot.exp.CTE) } ) # TODO: If a CTAS has "LIMIT 0", it's not really lineage, just copying the schema. + # Update statements implicitly read from the table being updated, so add those back in. + if isinstance(statement, sqlglot.exp.Update): + tables = tables | modified + return tables, modified @@ -276,6 +335,9 @@ def __init__( shared_connection=shared_conn, ) + def get_urns(self) -> Set[str]: + return set(self._schema_cache.keys()) + def get_urn_for_table(self, table: _TableName, lower: bool = False) -> str: # TODO: Validate that this is the correct 2/3 layer hierarchy for the platform. @@ -390,8 +452,6 @@ def convert_graphql_schema_metadata_to_info( ) } - # TODO add a method to load all from graphql - def close(self) -> None: self._schema_cache.close() @@ -425,14 +485,20 @@ def _column_level_lineage( # noqa: C901 default_db: Optional[str], default_schema: Optional[str], ) -> List[_ColumnLineageInfo]: - if not isinstance( - statement, - _SupportedColumnLineageTypesTuple, + is_create_ddl = _is_create_table_ddl(statement) + if ( + not isinstance( + statement, + _SupportedColumnLineageTypesTuple, + ) + and not is_create_ddl ): raise UnsupportedStatementTypeError( f"Can only generate column-level lineage for select-like inner statements, not {type(statement)}" ) + column_lineage: List[_ColumnLineageInfo] = [] + use_case_insensitive_cols = dialect in { # Column identifiers are case-insensitive in BigQuery, so we need to # do a normalization step beforehand to make sure it's resolved correctly. @@ -440,6 +506,11 @@ def _column_level_lineage( # noqa: C901 # Our snowflake source lowercases column identifiers, so we are forced # to do fuzzy (case-insensitive) resolution instead of exact resolution. "snowflake", + # Teradata column names are case-insensitive. + # A name, even when enclosed in double quotation marks, is not case sensitive. For example, CUSTOMER and Customer are the same. + # See more below: + # https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/acreldb/n0ejgx4895bofnn14rlguktfx5r3.htm + "teradata", } sqlglot_db_schema = sqlglot.MappingSchema( @@ -515,17 +586,20 @@ def _schema_aware_fuzzy_column_resolve( # - the select instead of the full outer statement # - schema info # - column qualification enabled + # - running the full pre-type annotation optimizer # logger.debug("Schema: %s", sqlglot_db_schema.mapping) - statement = sqlglot.optimizer.qualify.qualify( + statement = sqlglot.optimizer.optimizer.optimize( statement, dialect=dialect, schema=sqlglot_db_schema, + qualify_columns=True, validate_qualify_columns=False, identify=True, # sqlglot calls the db -> schema -> table hierarchy "catalog", "db", "table". catalog=default_db, db=default_schema, + rules=RULES_BEFORE_TYPE_ANNOTATION, ) except (sqlglot.errors.OptimizeError, ValueError) as e: raise SqlUnderstandingError( @@ -533,7 +607,46 @@ def _schema_aware_fuzzy_column_resolve( ) from e logger.debug("Qualified sql %s", statement.sql(pretty=True, dialect=dialect)) - column_lineage = [] + # Handle the create DDL case. 
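+    # For example, `CREATE TABLE db.t (a INT, b VARCHAR(10))` yields one
+    # downstream entry per column definition, each with an empty upstream list,
+    # since a bare CREATE DDL statement reads from no tables.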
+ if is_create_ddl: + assert ( + output_table is not None + ), "output_table must be set for create DDL statements" + + create_schema: sqlglot.exp.Schema = statement.this + sqlglot_columns = create_schema.expressions + + for column_def in sqlglot_columns: + if not isinstance(column_def, sqlglot.exp.ColumnDef): + # Ignore things like constraints. + continue + + output_col = _schema_aware_fuzzy_column_resolve( + output_table, column_def.name + ) + output_col_type = column_def.args.get("kind") + + column_lineage.append( + _ColumnLineageInfo( + downstream=_DownstreamColumnRef( + table=output_table, + column=output_col, + column_type=output_col_type, + ), + upstreams=[], + ) + ) + + return column_lineage + + # Try to figure out the types of the output columns. + try: + statement = sqlglot.optimizer.annotate_types.annotate_types( + statement, schema=sqlglot_db_schema + ) + except (sqlglot.errors.OptimizeError, sqlglot.errors.ParseError) as e: + # This is not a fatal error, so we can continue. + logger.debug("sqlglot failed to annotate or parse types: %s", e) try: assert isinstance(statement, _SupportedColumnLineageTypesTuple) @@ -543,9 +656,7 @@ def _schema_aware_fuzzy_column_resolve( (select_col.alias_or_name, select_col) for select_col in statement.selects ] logger.debug("output columns: %s", [col[0] for col in output_columns]) - output_col: str for output_col, original_col_expression in output_columns: - # print(f"output column: {output_col}") if output_col == "*": # If schema information is available, the * will be expanded to the actual columns. # Otherwise, we can't process it. @@ -573,7 +684,7 @@ def _schema_aware_fuzzy_column_resolve( # Generate SELECT lineage. # Using a set here to deduplicate upstreams. - direct_col_upstreams: Set[_ColumnRef] = set() + direct_raw_col_upstreams: Set[_ColumnRef] = set() for node in lineage_node.walk(): if node.downstream: # We only want the leaf nodes. @@ -588,8 +699,9 @@ def _schema_aware_fuzzy_column_resolve( if node.subfield: normalized_col = f"{normalized_col}.{node.subfield}" - col = _schema_aware_fuzzy_column_resolve(table_ref, normalized_col) - direct_col_upstreams.add(_ColumnRef(table=table_ref, column=col)) + direct_raw_col_upstreams.add( + _ColumnRef(table=table_ref, column=normalized_col) + ) else: # This branch doesn't matter. For example, a count(*) column would go here, and # we don't get any column-level lineage for that. @@ -605,19 +717,35 @@ def _schema_aware_fuzzy_column_resolve( output_col = _schema_aware_fuzzy_column_resolve(output_table, output_col) - if not direct_col_upstreams: + # Guess the output column type. + output_col_type = None + if original_col_expression.type: + output_col_type = original_col_expression.type + + # Fuzzy resolve upstream columns. + direct_resolved_col_upstreams = { + _ColumnRef( + table=edge.table, + column=_schema_aware_fuzzy_column_resolve(edge.table, edge.column), + ) + for edge in direct_raw_col_upstreams + } + + if not direct_resolved_col_upstreams: logger.debug(f' "{output_col}" has no upstreams') column_lineage.append( _ColumnLineageInfo( downstream=_DownstreamColumnRef( - table=output_table, column=output_col + table=output_table, + column=output_col, + column_type=output_col_type, ), - upstreams=sorted(direct_col_upstreams), + upstreams=sorted(direct_resolved_col_upstreams), # logic=column_logic.sql(pretty=True, dialect=dialect), ) ) - # TODO: Also extract referenced columns (e.g. 
non-SELECT lineage)
+        # TODO: Also extract referenced columns (aka auxiliary / non-SELECT lineage)
     except (sqlglot.errors.OptimizeError, ValueError) as e:
         raise SqlUnderstandingError(
             f"sqlglot failed to compute some lineage: {e}"
         ) from e
@@ -638,6 +766,78 @@ def _extract_select_from_create(
     return statement
 
 
+_UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT: Set[str] = set(
+    sqlglot.exp.Update.arg_types.keys()
+) - set(sqlglot.exp.Select.arg_types.keys())
+_UPDATE_FROM_TABLE_ARGS_TO_MOVE = {"joins", "laterals", "pivot"}
+
+
+def _extract_select_from_update(
+    statement: sqlglot.exp.Update,
+) -> sqlglot.exp.Select:
+    statement = statement.copy()
+
+    # The "SET" expressions need to be converted.
+    # For the UPDATE command, it'll be a list of EQ expressions, but the SELECT
+    # should contain aliased columns.
+    new_expressions = []
+    for expr in statement.expressions:
+        if isinstance(expr, sqlglot.exp.EQ) and isinstance(
+            expr.left, sqlglot.exp.Column
+        ):
+            new_expressions.append(
+                sqlglot.exp.Alias(
+                    this=expr.right,
+                    alias=expr.left.this,
+                )
+            )
+        else:
+            # If we don't know how to convert it, just leave it as-is. If this
+            # causes issues, they'll get caught later.
+            new_expressions.append(expr)
+
+    # Special translation for the `from` clause.
+    extra_args = {}
+    original_from = statement.args.get("from")
+    if original_from and isinstance(original_from.this, sqlglot.exp.Table):
+        # Move joins, laterals, and pivots from the Update->From->Table->field
+        # to the top-level Select->field.
+        for k in _UPDATE_FROM_TABLE_ARGS_TO_MOVE:
+            if k in original_from.this.args:
+                # Mutate the from table clause in-place.
+                extra_args[k] = original_from.this.args.pop(k)
+
+    select_statement = sqlglot.exp.Select(
+        **{
+            **{
+                k: v
+                for k, v in statement.args.items()
+                if k not in _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT
+            },
+            **extra_args,
+            "expressions": new_expressions,
+        }
+    )
+
+    # Update statements always implicitly have the updated table in context.
+    # TODO: Retain table name alias, if one was present.
+    if select_statement.args.get("from"):
+        select_statement = select_statement.join(
+            statement.this, append=True, join_kind="cross"
+        )
+    else:
+        select_statement = select_statement.from_(statement.this)
+
+    return select_statement
+
+
+def _is_create_table_ddl(statement: sqlglot.exp.Expression) -> bool:
+    return isinstance(statement, sqlglot.exp.Create) and isinstance(
+        statement.this, sqlglot.exp.Schema
+    )
+
+
 def _try_extract_select(
     statement: sqlglot.exp.Expression,
 ) -> sqlglot.exp.Expression:
@@ -654,6 +854,9 @@
     elif isinstance(statement, sqlglot.exp.Insert):
         # TODO Need to map column renames in the expressions part of the statement.
         statement = statement.expression
+    elif isinstance(statement, sqlglot.exp.Update):
+        # Assumption: the output table is already captured in the modified tables list.
+        statement = _extract_select_from_update(statement)
     elif isinstance(statement, sqlglot.exp.Create):
         # TODO May need to map column renames.
         # Assumption: the output table is already captured in the modified tables list.
@@ -665,9 +868,46 @@ def _try_extract_select( return statement +def _translate_sqlglot_type( + sqlglot_type: sqlglot.exp.DataType.Type, +) -> Optional[SchemaFieldDataTypeClass]: + TypeClass: Any + if sqlglot_type in sqlglot.exp.DataType.TEXT_TYPES: + TypeClass = StringTypeClass + elif sqlglot_type in sqlglot.exp.DataType.NUMERIC_TYPES or sqlglot_type in { + sqlglot.exp.DataType.Type.DECIMAL, + }: + TypeClass = NumberTypeClass + elif sqlglot_type in { + sqlglot.exp.DataType.Type.BOOLEAN, + sqlglot.exp.DataType.Type.BIT, + }: + TypeClass = BooleanTypeClass + elif sqlglot_type in { + sqlglot.exp.DataType.Type.DATE, + }: + TypeClass = DateTypeClass + elif sqlglot_type in sqlglot.exp.DataType.TEMPORAL_TYPES: + TypeClass = TimeTypeClass + elif sqlglot_type in { + sqlglot.exp.DataType.Type.ARRAY, + }: + TypeClass = ArrayTypeClass + elif sqlglot_type in { + sqlglot.exp.DataType.Type.UNKNOWN, + }: + return None + else: + logger.debug("Unknown sqlglot type: %s", sqlglot_type) + return None + + return SchemaFieldDataTypeClass(type=TypeClass()) + + def _translate_internal_column_lineage( table_name_urn_mapping: Dict[_TableName, str], raw_column_lineage: _ColumnLineageInfo, + dialect: str, ) -> ColumnLineageInfo: downstream_urn = None if raw_column_lineage.downstream.table: @@ -676,6 +916,18 @@ def _translate_internal_column_lineage( downstream=DownstreamColumnRef( table=downstream_urn, column=raw_column_lineage.downstream.column, + column_type=_translate_sqlglot_type( + raw_column_lineage.downstream.column_type.this + ) + if raw_column_lineage.downstream.column_type + else None, + native_column_type=raw_column_lineage.downstream.column_type.sql( + dialect=dialect + ) + if raw_column_lineage.downstream.column_type + and raw_column_lineage.downstream.column_type.this + != sqlglot.exp.DataType.Type.UNKNOWN + else None, ), upstreams=[ ColumnRef( @@ -692,12 +944,14 @@ def _get_dialect(platform: str) -> str: # TODO: convert datahub platform names to sqlglot dialect if platform == "presto-on-hive": return "hive" + if platform == "mssql": + return "tsql" else: return platform def _sqlglot_lineage_inner( - sql: str, + sql: sqlglot.exp.ExpOrStr, schema_resolver: SchemaResolver, default_db: Optional[str] = None, default_schema: Optional[str] = None, @@ -747,7 +1001,7 @@ def _sqlglot_lineage_inner( # Fetch schema info for the relevant tables. table_name_urn_mapping: Dict[_TableName, str] = {} table_name_schema_mapping: Dict[_TableName, SchemaInfo] = {} - for table in itertools.chain(tables, modified): + for table in tables | modified: # For select statements, qualification will be a no-op. For other statements, this # is where the qualification actually happens. qualified_table = table.qualified( @@ -763,7 +1017,7 @@ def _sqlglot_lineage_inner( # Also include the original, non-qualified table name in the urn mapping. table_name_urn_mapping[table] = urn - total_tables_discovered = len(tables) + len(modified) + total_tables_discovered = len(tables | modified) total_schemas_resolved = len(table_name_schema_mapping) debug_info = SqlParsingDebugInfo( confidence=0.9 if total_tables_discovered == total_schemas_resolved @@ -778,19 +1032,25 @@ def _sqlglot_lineage_inner( ) # Simplify the input statement for column-level lineage generation. 
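+    # Note: the extraction below can itself raise (e.g. while rewriting an UPDATE
+    # into an equivalent SELECT), so such failures are recorded as column-lineage
+    # errors rather than aborting the table-level result.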
-    select_statement = _try_extract_select(statement)
+    try:
+        select_statement = _try_extract_select(statement)
+    except Exception as e:
+        logger.debug(f"Failed to extract select from statement: {e}", exc_info=True)
+        debug_info.column_error = e
+        select_statement = None
 
     # Generate column-level lineage.
     column_lineage: Optional[List[_ColumnLineageInfo]] = None
     try:
-        column_lineage = _column_level_lineage(
-            select_statement,
-            dialect=dialect,
-            input_tables=table_name_schema_mapping,
-            output_table=downstream_table,
-            default_db=default_db,
-            default_schema=default_schema,
-        )
+        if select_statement is not None:
+            column_lineage = _column_level_lineage(
+                select_statement,
+                dialect=dialect,
+                input_tables=table_name_schema_mapping,
+                output_table=downstream_table,
+                default_db=default_db,
+                default_schema=default_schema,
+            )
     except UnsupportedStatementTypeError as e:
         # Inject details about the outer statement type too.
         e.args = (f"{e.args[0]} (outer statement type: {type(statement)})",)
@@ -810,7 +1070,7 @@
     if column_lineage:
         column_lineage_urns = [
             _translate_internal_column_lineage(
-                table_name_urn_mapping, internal_col_lineage
+                table_name_urn_mapping, internal_col_lineage, dialect=dialect
             )
             for internal_col_lineage in column_lineage
         ]
@@ -906,32 +1166,56 @@ def create_lineage_sql_parsed_result(
     env: str,
     schema: Optional[str] = None,
     graph: Optional[DataHubGraph] = None,
-) -> Optional["SqlParsingResult"]:
-    parsed_result: Optional["SqlParsingResult"] = None
+) -> SqlParsingResult:
+    needs_close = False
     try:
-        schema_resolver = (
-            graph._make_schema_resolver(
+        if graph:
+            schema_resolver = graph._make_schema_resolver(
                 platform=platform,
                 platform_instance=platform_instance,
                 env=env,
             )
-            if graph is not None
-            else SchemaResolver(
+        else:
+            needs_close = True
+            schema_resolver = SchemaResolver(
                 platform=platform,
                 platform_instance=platform_instance,
                 env=env,
                 graph=None,
             )
-        )
-        parsed_result = sqlglot_lineage(
+        return sqlglot_lineage(
             query,
             schema_resolver=schema_resolver,
             default_db=database,
             default_schema=schema,
         )
     except Exception as e:
-        logger.debug(f"Fail to prase query {query}", exc_info=e)
-        logger.warning("Fail to parse custom SQL")
+        return SqlParsingResult(
+            in_tables=[],
+            out_tables=[],
+            column_lineage=None,
+            debug_info=SqlParsingDebugInfo(
+                table_error=e,
+            ),
+        )
+    finally:
+        if needs_close:
+            schema_resolver.close()
 
-    return parsed_result
+
+def view_definition_lineage_helper(
+    result: SqlParsingResult, view_urn: str
+) -> SqlParsingResult:
+    if result.query_type is QueryType.SELECT:
+        # Some platforms (e.g. postgres) store only the bare `SELECT ...` statement
+        # as the view definition, without the `CREATE VIEW ...` prefix. For such view
+        # definitions, `result.out_tables` and `result.column_lineage[].downstream`
+        # are empty in the `sqlglot_lineage` response, whereas upstream details and
+        # downstream column details are extracted correctly.
+        # Here, we inject the view's urn into `result.out_tables` and
+        # `result.column_lineage[].downstream` to get the complete lineage result.
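+        # For example, a postgres view defined as just `SELECT a, b FROM t` parses
+        # as a plain SELECT, so `out_tables` would remain empty without this injection.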
+ result.out_tables = [view_urn] + if result.column_lineage: + for col_result in result.column_lineage: + col_result.downstream.table = view_urn + return result diff --git a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py index 261f95331af612..169a4ac3649a33 100644 --- a/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py +++ b/metadata-ingestion/src/datahub/utilities/urns/urn_iter.py @@ -3,7 +3,11 @@ from avro.schema import Field, RecordSchema from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.metadata.schema_classes import DictWrapper +from datahub.metadata.schema_classes import ( + DictWrapper, + MetadataChangeEventClass, + MetadataChangeProposalClass, +) from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.urn import Urn, guess_entity_type @@ -32,7 +36,7 @@ def list_urns_with_path( if isinstance(model, MetadataChangeProposalWrapper): if model.entityUrn: - urns.append((model.entityUrn, ["urn"])) + urns.append((model.entityUrn, ["entityUrn"])) if model.entityKeyAspect: urns.extend( _add_prefix_to_paths( @@ -83,7 +87,15 @@ def list_urns(model: Union[DictWrapper, MetadataChangeProposalWrapper]) -> List[ return [urn for urn, _ in list_urns_with_path(model)] -def transform_urns(model: DictWrapper, func: Callable[[str], str]) -> None: +def transform_urns( + model: Union[ + DictWrapper, + MetadataChangeEventClass, + MetadataChangeProposalClass, + MetadataChangeProposalWrapper, + ], + func: Callable[[str], str], +) -> None: """ Rewrites all URNs in the given object according to the given function. """ @@ -95,7 +107,9 @@ def transform_urns(model: DictWrapper, func: Callable[[str], str]) -> None: def _modify_at_path( - model: Union[DictWrapper, list], path: _Path, new_value: str + model: Union[DictWrapper, MetadataChangeProposalWrapper, list], + path: _Path, + new_value: str, ) -> None: assert len(path) > 0 @@ -103,15 +117,17 @@ def _modify_at_path( if isinstance(path[0], int): assert isinstance(model, list) model[path[0]] = new_value - else: - assert isinstance(model, DictWrapper) + elif isinstance(model, DictWrapper): model._inner_dict[path[0]] = new_value + else: # MCPW + setattr(model, path[0], new_value) elif isinstance(path[0], int): assert isinstance(model, list) - return _modify_at_path(model[path[0]], path[1:], new_value) - else: - assert isinstance(model, DictWrapper) - return _modify_at_path(model._inner_dict[path[0]], path[1:], new_value) + _modify_at_path(model[path[0]], path[1:], new_value) + elif isinstance(model, DictWrapper): + _modify_at_path(model._inner_dict[path[0]], path[1:], new_value) + else: # MCPW + _modify_at_path(getattr(model, path[0]), path[1:], new_value) def _lowercase_dataset_urn(dataset_urn: str) -> str: @@ -120,7 +136,14 @@ def _lowercase_dataset_urn(dataset_urn: str) -> str: return str(cur_urn) -def lowercase_dataset_urns(model: DictWrapper) -> None: +def lowercase_dataset_urns( + model: Union[ + DictWrapper, + MetadataChangeEventClass, + MetadataChangeProposalClass, + MetadataChangeProposalWrapper, + ] +) -> None: def modify_urn(urn: str) -> str: if guess_entity_type(urn) == "dataset": return _lowercase_dataset_urn(urn) diff --git a/metadata-ingestion/tests/integration/delta_lake/delta_lake_minio_mces_golden.json b/metadata-ingestion/tests/integration/delta_lake/delta_lake_minio_mces_golden.json index 52e92d27549f05..ed65d740377967 100644 --- a/metadata-ingestion/tests/integration/delta_lake/delta_lake_minio_mces_golden.json +++ 
b/metadata-ingestion/tests/integration/delta_lake/delta_lake_minio_mces_golden.json @@ -136,7 +136,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -156,7 +157,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -171,7 +173,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -186,7 +189,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -203,7 +207,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -218,7 +223,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -238,7 +244,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -253,7 +260,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -268,7 +276,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -285,7 +294,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -300,7 +310,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -320,7 +331,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -335,7 +347,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -355,14 +368,16 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "version": "0" }, "lastUpdatedTimestamp": 1655664815399 } }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } }, { @@ -386,7 +401,8 @@ }, "systemMetadata": { "lastObserved": 1672531200000, - "runId": "delta-lake-test" + "runId": "delta-lake-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_allow_table.json b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_allow_table.json index 4dcdf71ce00951..6ec6eb2809a105 100644 --- a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_allow_table.json +++ b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_allow_table.json @@ -94,7 +94,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": 
"allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -115,7 +116,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -130,7 +132,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -146,7 +149,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -163,7 +167,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -183,7 +188,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -204,7 +210,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -219,7 +226,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -235,7 +243,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -252,7 +261,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -267,7 +277,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -291,7 +302,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -312,7 +324,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -327,7 +340,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -343,7 +357,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -360,7 +375,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -375,7 +391,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -403,7 +420,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -424,7 +442,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -439,7 +458,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -455,7 +475,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + 
"runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -472,7 +493,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -487,7 +509,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -519,7 +542,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -540,7 +564,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -555,7 +580,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -571,7 +597,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -588,7 +615,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -603,7 +631,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -639,7 +668,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -654,7 +684,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -674,14 +705,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831476907 + "lastUpdatedTimestamp": 1655831477768 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -702,14 +736,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831477701 + "lastUpdatedTimestamp": 1655831477745 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -730,14 +766,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831477726 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -758,14 +796,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831477745 + "lastUpdatedTimestamp": 1655831477701 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": 
"no-run-id-provided" } }, { @@ -786,14 +826,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831477768 + "lastUpdatedTimestamp": 1655831476907 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -833,7 +874,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -973,7 +1015,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -988,7 +1031,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1008,14 +1052,16 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "version": "0" }, "lastUpdatedTimestamp": 1655664815399 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1055,7 +1101,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1152,7 +1199,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1167,7 +1215,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1187,14 +1236,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831649166 + "lastUpdatedTimestamp": 1655831649788 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1215,14 +1267,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831649715 + "lastUpdatedTimestamp": 1655831649754 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1243,14 +1297,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831649731 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1271,14 +1327,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831649754 + "lastUpdatedTimestamp": 1655831649715 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": 
"allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1299,14 +1357,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831649788 + "lastUpdatedTimestamp": 1655831649166 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1346,7 +1405,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1444,7 +1504,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1465,7 +1526,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1480,7 +1542,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1496,7 +1559,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1513,7 +1577,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1528,7 +1593,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1568,7 +1634,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1583,7 +1650,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1603,14 +1671,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831865396 + "lastUpdatedTimestamp": 1655831866541 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1631,14 +1702,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831866337 + "lastUpdatedTimestamp": 1655831866447 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1659,14 +1732,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831866398 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1687,14 +1762,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - 
"readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831866447 + "lastUpdatedTimestamp": 1655831866337 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1715,14 +1792,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831866541 + "lastUpdatedTimestamp": 1655831865396 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1766,7 +1844,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "allow_table.json" + "runId": "allow_table.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_inner_table.json b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_inner_table.json index 901e4c1262d3f3..715beebfe22fb1 100644 --- a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_inner_table.json +++ b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_inner_table.json @@ -94,7 +94,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +115,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +131,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -144,7 +147,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -161,7 +165,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -176,7 +181,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -196,7 +202,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -211,7 +218,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -226,7 +234,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -243,7 +252,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -258,7 +268,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -278,7 +289,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -298,7 +310,8 @@ }, "systemMetadata": { 
"lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -313,7 +326,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -328,7 +342,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -345,7 +360,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -360,7 +376,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -384,7 +401,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -404,7 +422,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -419,7 +438,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -434,7 +454,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -451,7 +472,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -466,7 +488,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -494,7 +517,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -514,7 +538,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -529,7 +554,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -544,7 +570,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -561,7 +588,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -576,7 +604,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -608,7 +637,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -623,7 +653,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -643,14 +674,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + 
"version": "4" }, - "lastUpdatedTimestamp": 1655831476907 + "lastUpdatedTimestamp": 1655831477768 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -671,14 +705,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831477701 + "lastUpdatedTimestamp": 1655831477745 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -699,14 +735,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831477726 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -727,14 +765,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831477745 + "lastUpdatedTimestamp": 1655831477701 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -755,14 +795,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831477768 + "lastUpdatedTimestamp": 1655831476907 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -798,7 +839,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -938,7 +980,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -953,7 +996,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -973,14 +1017,16 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "version": "0" }, "lastUpdatedTimestamp": 1655664815399 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1016,7 +1062,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1113,7 +1160,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1128,7 +1176,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1148,14 +1197,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + 
"isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831649166 + "lastUpdatedTimestamp": 1655831649788 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1176,14 +1228,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831649715 + "lastUpdatedTimestamp": 1655831649754 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1204,14 +1258,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831649731 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1232,14 +1288,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831649754 + "lastUpdatedTimestamp": 1655831649715 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1260,14 +1318,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831649788 + "lastUpdatedTimestamp": 1655831649166 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1303,7 +1362,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1401,7 +1461,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1421,7 +1482,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1436,7 +1498,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1451,7 +1514,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1468,7 +1532,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1483,7 +1548,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1519,7 +1585,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1534,7 +1601,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": 
"inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1554,14 +1622,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831865396 + "lastUpdatedTimestamp": 1655831866541 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1582,14 +1653,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831866337 + "lastUpdatedTimestamp": 1655831866447 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1610,14 +1683,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831866398 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1638,14 +1713,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831866447 + "lastUpdatedTimestamp": 1655831866337 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1666,14 +1743,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831866541 + "lastUpdatedTimestamp": 1655831865396 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -1713,7 +1791,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "inner_table.json" + "runId": "inner_table.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_relative_path.json b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_relative_path.json index 18474e819334ee..2076ec4096f687 100644 --- a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_relative_path.json +++ b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_relative_path.json @@ -94,7 +94,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +115,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +131,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -144,7 +147,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + 
"lastRunId": "no-run-id-provided" } }, { @@ -161,7 +165,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -176,7 +181,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -191,7 +197,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -211,14 +218,17 @@ "customProperties": { "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", - "isolationLevel": "Serializable" + "isolationLevel": "Serializable", + "readVersion": "3", + "version": "4" }, - "lastUpdatedTimestamp": 1655831476907 + "lastUpdatedTimestamp": 1655831477768 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -239,14 +249,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "0" + "readVersion": "2", + "version": "3" }, - "lastUpdatedTimestamp": 1655831477701 + "lastUpdatedTimestamp": 1655831477745 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -267,14 +279,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "1" + "readVersion": "1", + "version": "2" }, "lastUpdatedTimestamp": 1655831477726 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -295,14 +309,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "2" + "readVersion": "0", + "version": "1" }, - "lastUpdatedTimestamp": 1655831477745 + "lastUpdatedTimestamp": 1655831477701 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -323,14 +339,15 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "version": "0" }, - "lastUpdatedTimestamp": 1655831477768 + "lastUpdatedTimestamp": 1655831476907 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } }, { @@ -350,7 +367,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "relative_path.json" + "runId": "relative_path.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_single_table.json b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_single_table.json index bb47a077e878b4..42e3b19612c2b8 100644 --- a/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_single_table.json +++ b/metadata-ingestion/tests/integration/delta_lake/golden_files/local/golden_mces_single_table.json @@ -93,7 +93,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": 
"no-run-id-provided" } }, { @@ -113,7 +114,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -128,7 +130,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -143,7 +146,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -160,7 +164,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -175,7 +180,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -195,7 +201,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -210,7 +217,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -225,7 +233,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -242,7 +251,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -257,7 +267,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -277,7 +288,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -297,7 +309,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -312,7 +325,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -327,7 +341,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -344,7 +359,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -359,7 +375,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -383,7 +400,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -403,7 +421,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -418,7 +437,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -433,7 +453,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + 
"runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -450,7 +471,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -465,7 +487,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -493,7 +516,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -513,7 +537,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -528,7 +553,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -543,7 +569,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -560,7 +587,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -575,7 +603,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -607,7 +636,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -622,7 +652,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -643,14 +674,16 @@ "engineInfo": "local Delta-Standalone/0.4.0", "isBlindAppend": "True", "isolationLevel": "Serializable", - "readVersion": "3" + "readVersion": "3", + "version": "4" }, "lastUpdatedTimestamp": 1655831477768 } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } }, { @@ -686,7 +719,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "single_table.json" + "runId": "single_table.json", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json b/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json index f3b6d2b8138cc5..6774d4c7055b99 100644 --- a/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json +++ b/metadata-ingestion/tests/integration/hive/hive_mces_all_db_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - 
"runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -93,7 +98,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -111,7 +117,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore", @@ -121,7 +127,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721972", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -187,7 +193,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -204,7 +211,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -224,7 +232,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -239,7 +248,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -257,17 +267,19 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test", "Table Type:": "MANAGED_TABLE", "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: another.comment": "This table has no partitions", + "Table Parameters: comment": "This table has array of structs", "Table Parameters: numFiles": "1", "Table Parameters: numRows": "1", "Table Parameters: rawDataSize": "32", "Table Parameters: totalSize": "33", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721976", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -278,6 +290,7 @@ "Storage Desc Params: serialization.format": "1" }, "name": "array_struct_test", + "description": "This table has array of structs", "tags": [] } }, @@ -304,6 +317,7 @@ { "fieldPath": "property_id", "nullable": true, + "description": "id of property", "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -316,6 +330,7 @@ { "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", "nullable": true, + "description": "service types and providers", "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -368,7 +383,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -385,7 +401,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + 
"runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -405,7 +422,189 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Table Type:": "VIRTUAL_VIEW", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "null", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.array_struct_test", + "View Expanded Text:": "select `array_struct_test`.`property_id`, `array_struct_test`.`service` from `db1`.`array_struct_test`", + "View Rewrite Enabled:": "No" + }, + "name": "array_struct_test_view", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.array_struct_test_view", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] 
+ } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -420,7 +619,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -438,7 +638,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test", @@ -448,7 +648,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721978", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -518,7 +718,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -535,7 +736,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -555,7 +757,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -570,7 +773,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -588,7 +792,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test", @@ -598,7 +802,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721978", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -717,7 +921,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -734,7 +939,8 @@ }, "systemMetadata": { 
"lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -754,7 +960,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -769,7 +976,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -787,16 +995,17 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:22 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:08 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", "Table Type:": "MANAGED_TABLE", "Table Parameters: numFiles": "1", + "Table Parameters: numPartitions": "1", "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1688578704", + "Table Parameters: transient_lastDdlTime": "1697721968", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -853,6 +1062,18 @@ "nativeDataType": "string", "recursive": false, "isPartOfKey": false + }, + { + "fieldPath": "baz", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false } ] } @@ -862,7 +1083,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -879,7 +1101,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -899,7 +1122,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -914,7 +1138,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -932,7 +1157,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test", @@ -942,7 +1167,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721972", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1039,7 +1264,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1056,7 +1282,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1076,7 +1303,188 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": 
"dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test_view_materialized", + "Table Type:": "MATERIALIZED_VIEW", + "Table Parameters: numFiles": "0", + "Table Parameters: totalSize": "0", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", + "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.struct_test", + "View Expanded Text:": "null", + "View Rewrite Enabled:": "No" + }, + "name": "struct_test_view_materialized", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.struct_test_view_materialized", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "struct>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1091,7 +1499,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1109,7 +1518,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test", @@ -1119,10 +1528,10 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578710", - "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", - "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", - "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", + "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", "Compressed:": "No", "Num Buckets:": "-1", "Bucket Columns:": "[]", @@ -1285,7 +1694,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1302,7 +1712,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1322,7 +1733,26 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW `db1.array_struct_test_view` AS select `array_struct_test`.`property_id`, `array_struct_test`.`service` from `db1`.`array_struct_test`", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1342,7 +1772,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1357,7 +1788,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1372,7 +1804,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -1389,7 +1822,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1404,7 +1838,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1419,7 +1854,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1437,7 +1873,7 @@ "customProperties": { "Database:": "db2", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:24 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:10 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db2.db/pokes", @@ -1446,7 +1882,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721971", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1454,10 +1890,7 @@ "Num Buckets:": "-1", "Bucket Columns:": "[]", "Sort Columns:": "[]", - "Storage Desc Params: serialization.format": "1", - "Table:": "db2.pokes", - "Constraint Name:": "pk_1173723383_1683022998392_0", - "Column Names:": "foo" + "Storage Desc Params: serialization.format": "1" }, "name": "pokes", "tags": [] @@ -1515,7 +1948,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1532,7 +1966,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1552,7 +1987,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1572,7 +2008,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1587,7 +2024,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1602,7 +2040,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1619,7 +2058,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1634,7 +2074,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json index 08f281f398909b..e93924049f626c 100644 --- a/metadata-ingestion/tests/integration/hive/hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/hive/hive_mces_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": 
"hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -93,7 +98,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -111,7 +117,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/_test_table_underscore", @@ -121,7 +127,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721972", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -187,7 +193,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -204,7 +211,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -224,7 +232,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -239,7 +248,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -257,17 +267,19 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/array_struct_test", "Table Type:": "MANAGED_TABLE", "Table Parameters: COLUMN_STATS_ACCURATE": "{\\\"BASIC_STATS\\\":\\\"true\\\"}", + "Table Parameters: another.comment": "This table has no partitions", + "Table Parameters: comment": "This table has array of structs", "Table Parameters: numFiles": "1", "Table Parameters: numRows": "1", "Table Parameters: rawDataSize": "32", "Table Parameters: totalSize": "33", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721976", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -278,6 +290,7 @@ "Storage Desc Params: serialization.format": "1" }, "name": "array_struct_test", + "description": "This table has array of structs", "tags": [] } }, @@ -304,6 +317,7 @@ { "fieldPath": "property_id", "nullable": true, + "description": "id of property", "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -316,6 +330,7 @@ { 
"fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", "nullable": true, + "description": "service types and providers", "type": { "type": { "com.linkedin.pegasus2avro.schema.ArrayType": { @@ -368,7 +383,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -385,7 +401,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -405,7 +422,189 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Table Type:": "VIRTUAL_VIEW", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "null", + "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.array_struct_test", + "View Expanded Text:": "select `array_struct_test`.`property_id`, `array_struct_test`.`service` from `db1`.`array_struct_test`", + "View Rewrite Enabled:": "No" + }, + "name": "array_struct_test_view", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.array_struct_test_view", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "record" + ] + } + } + }, + "nativeDataType": "array>>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array>>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + 
"isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=array].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -420,7 +619,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -438,7 +638,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/map_test", @@ -448,7 +648,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578710", + "Table Parameters: transient_lastDdlTime": "1697721978", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -518,7 +718,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -535,7 +736,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -555,7 +757,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -570,7 +773,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -588,7 +792,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/nested_struct_test", @@ -598,7 +802,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": 
"1688578710", + "Table Parameters: transient_lastDdlTime": "1697721978", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -717,7 +921,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -734,7 +939,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -754,7 +960,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -769,7 +976,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -787,16 +995,17 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:22 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:08 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/pokes", "Table Type:": "MANAGED_TABLE", "Table Parameters: numFiles": "1", + "Table Parameters: numPartitions": "1", "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "5812", - "Table Parameters: transient_lastDdlTime": "1688578704", + "Table Parameters: transient_lastDdlTime": "1697721968", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -853,6 +1062,18 @@ "nativeDataType": "string", "recursive": false, "isPartOfKey": false + }, + { + "fieldPath": "baz", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false } ] } @@ -862,7 +1083,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -879,7 +1101,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -899,7 +1122,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -914,7 +1138,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -932,7 +1157,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:26 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:12 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test", @@ -942,7 +1167,7 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578706", + "Table Parameters: transient_lastDdlTime": "1697721972", "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", @@ -1039,7 +1264,8 @@ }, 
"systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1056,7 +1282,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1076,7 +1303,188 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "Database:": "db1", + "Owner:": "root", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", + "LastAccessTime:": "UNKNOWN", + "Retention:": "0", + "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/struct_test_view_materialized", + "Table Type:": "MATERIALIZED_VIEW", + "Table Parameters: numFiles": "0", + "Table Parameters: totalSize": "0", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", + "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", + "Compressed:": "No", + "Num Buckets:": "-1", + "Bucket Columns:": "[]", + "Sort Columns:": "[]", + "View Original Text:": "select * from db1.struct_test", + "View Expanded Text:": "null", + "View Rewrite Enabled:": "No" + }, + "name": "struct_test_view_materialized", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "db1.struct_test_view_materialized", + "platform": "urn:li:dataPlatform:hive", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "property_id", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "int", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.RecordType": {} + } + }, + "nativeDataType": "struct>", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"struct>\"}" + }, + { + "fieldPath": "[version=2.0].[type=struct].[type=struct].service.[type=string].type", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"string\", \"_nullable\": true}" + }, + { + "fieldPath": 
"[version=2.0].[type=struct].[type=struct].service.[type=array].[type=int].provider", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": { + "nestedType": [ + "int" + ] + } + } + }, + "nativeDataType": "array", + "recursive": false, + "isPartOfKey": false, + "jsonProps": "{\"native_data_type\": \"array\"}" + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.struct_test_view_materialized,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:ded36d15fcfbbb939830549697122661", + "urn": "urn:li:container:ded36d15fcfbbb939830549697122661" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1091,7 +1499,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1109,7 +1518,7 @@ "customProperties": { "Database:": "db1", "Owner:": "root", - "CreateTime:": "Wed Jul 05 17:38:30 UTC 2023", + "CreateTime:": "Thu Oct 19 13:26:18 UTC 2023", "LastAccessTime:": "UNKNOWN", "Retention:": "0", "Location:": "hdfs://namenode:8020/user/hive/warehouse/db1.db/union_test", @@ -1119,10 +1528,10 @@ "Table Parameters: numRows": "0", "Table Parameters: rawDataSize": "0", "Table Parameters: totalSize": "0", - "Table Parameters: transient_lastDdlTime": "1688578710", - "SerDe Library:": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe", - "InputFormat:": "org.apache.hadoop.mapred.TextInputFormat", - "OutputFormat:": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", + "Table Parameters: transient_lastDdlTime": "1697721978", + "SerDe Library:": "org.apache.hadoop.hive.ql.io.orc.OrcSerde", + "InputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcInputFormat", + "OutputFormat:": "org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat", "Compressed:": "No", "Num Buckets:": "-1", "Bucket Columns:": "[]", @@ -1285,7 +1694,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1302,7 +1712,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1322,7 +1733,26 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "hive-test" + "runId": "hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:hive,db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "CREATE VIEW `db1.array_struct_test_view` AS select `array_struct_test`.`property_id`, `array_struct_test`.`service` from `db1`.`array_struct_test`", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 
1586847600000, + "runId": "hive-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/hive/hive_setup.sql b/metadata-ingestion/tests/integration/hive/hive_setup.sql index 8fb8498894bc06..323a78e24d10b3 100644 --- a/metadata-ingestion/tests/integration/hive/hive_setup.sql +++ b/metadata-ingestion/tests/integration/hive/hive_setup.sql @@ -1,10 +1,10 @@ CREATE DATABASE IF NOT EXISTS db1; CREATE DATABASE IF NOT EXISTS db2; -- Setup a "pokes" example table. -CREATE TABLE IF NOT EXISTS db1.pokes (foo INT, bar STRING); -LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db1.pokes; +CREATE TABLE IF NOT EXISTS db1.pokes (foo INT, bar STRING) PARTITIONED BY (baz STRING); +LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db1.pokes PARTITION (baz='dummy'); -CREATE TABLE IF NOT EXISTS db2.pokes (foo INT, bar STRING, CONSTRAINT pk_1173723383_1683022998392_0 primary key(foo) DISABLE NOVALIDATE NORELY); +CREATE TABLE IF NOT EXISTS db2.pokes (foo INT, bar STRING); LOAD DATA LOCAL INPATH '/opt/hive/examples/files/kv1.txt' OVERWRITE INTO TABLE db2.pokes; -- Setup a table with a special character. @@ -23,12 +23,12 @@ CREATE TABLE IF NOT EXISTS db1.struct_test CREATE TABLE IF NOT EXISTS db1.array_struct_test ( - property_id INT, + property_id INT COMMENT 'id of property', service array< struct<type: string, provider: array<int> - >> -); + >> COMMENT 'service types and providers' +) TBLPROPERTIES ('comment' = 'This table has array of structs', 'another.comment' = 'This table has no partitions'); WITH test_data as ( @@ -39,6 +39,9 @@ test_data as ( INSERT INTO TABLE db1.array_struct_test select * from test_data; +CREATE MATERIALIZED VIEW db1.struct_test_view_materialized as select * from db1.struct_test; +CREATE VIEW db1.array_struct_test_view as select * from db1.array_struct_test; + CREATE TABLE IF NOT EXISTS db1.nested_struct_test ( property_id INT, @@ -50,9 +53,6 @@ CREATE TABLE IF NOT EXISTS db1.nested_struct_test CREATE TABLE db1.union_test( foo UNIONTYPE<int, double, array<string>, struct<a:int,b:string>, struct<c:int,d:string>> -); +) STORED AS ORC ; -CREATE TABLE db1.map_test( - KeyValue String, - RecordId map -); \ No newline at end of file +CREATE TABLE db1.map_test(KeyValue String, RecordId map); \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json index dee85b40bb7a81..1da42b94e320cf 100644 --- a/metadata-ingestion/tests/integration/looker/golden_looker_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_looker_mces.json @@ -533,20 +533,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -566,20 +552,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -599,20 +571,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { 
"urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json index 72db36e63daf77..685a606a57c339 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_allow_ingest.json @@ -327,20 +327,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -360,20 +346,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -393,20 +365,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json index e5508bdb06b9e0..069788cb088ac2 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_external_project_view_mces.json @@ -327,20 +327,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -360,20 +346,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -393,20 +365,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": 
"urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json index b0f66e7b245c96..f1c932ebd5a707 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_file_path_ingest.json @@ -335,20 +335,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -369,20 +355,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -403,20 +375,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json index 91e13debfa0283..9521c9af4bbdcc 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_independent_look_ingest.json @@ -550,20 +550,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -583,20 +569,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -616,20 +588,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json 
b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json index e93079119e4f49..dbacd52fe83de5 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest.json @@ -327,20 +327,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -360,20 +346,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -393,20 +365,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json index a9c8efa7cdb980..aaa874d9ff3483 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json +++ b/metadata-ingestion/tests/integration/looker/golden_test_ingest_joins.json @@ -351,20 +351,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -384,20 +370,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -417,20 +389,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json index edd15624a14cd4..be8db0722aea33 100644 --- a/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json +++ 
b/metadata-ingestion/tests/integration/looker/golden_test_ingest_unaliased_joins.json @@ -343,20 +343,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -376,20 +362,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -409,20 +381,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json index aebc89b609a08b..05b74f163ad45f 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_golden_deleted_stateful.json @@ -327,20 +327,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -360,20 +346,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -393,20 +365,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json index 34bded3cf691e5..0778aa0050b007 100644 --- a/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json +++ b/metadata-ingestion/tests/integration/looker/looker_mces_usage_history.json @@ -279,20 +279,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - 
"com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -312,20 +298,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -345,20 +317,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json index 238f4c2580cdf2..5a0bd4e12fd3a2 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_bigquery.json @@ -2121,20 +2121,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2154,20 +2140,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2187,20 +2159,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json index 45d5d839e9d21c..1b0ee3216383cd 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_api_hive2.json @@ -2121,20 +2121,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2154,20 
+2140,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2187,20 +2159,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json index 187cedaefb6b21..b960ba581e6b57 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_badsql_parser.json @@ -2004,20 +2004,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2037,20 +2023,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2070,20 +2042,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json index c2c879e38f37bb..e29292a44c949d 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline.json @@ -2121,20 +2121,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2154,20 +2140,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": 
"urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2187,20 +2159,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json index c1ac54b0fb588d..04ecaecbd4afb2 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_deny_pattern.json @@ -584,20 +584,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -617,20 +603,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -650,20 +622,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json index f602ca37b31607..080931ae637bc8 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_offline_platform_instance.json @@ -2121,20 +2121,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2154,20 +2140,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2187,20 +2159,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": 
"urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json index 104bd365669e34..5826c4316b539d 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_mces_with_external_urls.json @@ -2134,20 +2134,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -2167,20 +2153,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -2200,20 +2172,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json index 37a6c94c6952e3..53d1ec0229de16 100644 --- a/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json +++ b/metadata-ingestion/tests/integration/lookml/lookml_reachable_views.json @@ -681,20 +681,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Dimension", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Dimension", @@ -714,20 +700,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Temporal", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { "com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Temporal", @@ -747,20 +719,6 @@ "com.linkedin.pegasus2avro.metadata.snapshot.TagSnapshot": { "urn": "urn:li:tag:Measure", "aspects": [ - { - "com.linkedin.pegasus2avro.common.Ownership": { - "owners": [ - { - "owner": "urn:li:corpuser:datahub", - "type": "DATAOWNER" - } - ], - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - } - } - }, { 
"com.linkedin.pegasus2avro.tag.TagProperties": { "name": "Measure", diff --git a/metadata-ingestion/tests/integration/mongodb/mongodb_mces_golden.json b/metadata-ingestion/tests/integration/mongodb/mongodb_mces_golden.json index 1f662cfe514e2c..ec3fd80e6a6ea4 100644 --- a/metadata-ingestion/tests/integration/mongodb/mongodb_mces_golden.json +++ b/metadata-ingestion/tests/integration/mongodb/mongodb_mces_golden.json @@ -1,4141 +1,4245 @@ [ { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,mngdb.emptyCollection,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": {}, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "emptyCollection", - "platform": "urn:li:dataPlatform:mongodb", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.Schemaless": {} - }, - "fields": [] - } + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "emptyCollection", + "platform": "urn:li:dataPlatform:mongodb", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mongodb", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "firstCollection", + "platform": "urn:li:dataPlatform:mongodb", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.BytesType": {} + } + }, + "nativeDataType": "oid", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "age", + "nullable": false, + "type": { + "type": { + 
"com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "canSwim", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "emptyObject", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.RecordType": {} + } + }, + "nativeDataType": "OBJECT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteColor", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteFood", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.RecordType": {} + } + }, + "nativeDataType": "OBJECT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteFood.calories", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteFood.emptyObject", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.RecordType": {} + } + }, + "nativeDataType": "OBJECT", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteFood.ingredients", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.ArrayType": {} + } + }, + "nativeDataType": "ARRAY", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteFood.ingredients.color", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteFood.ingredients.from", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteFood.ingredients.name", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteFood.name", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "favoriteFood.servings", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "legs", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "mixedType", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.UnionType": {} + } + }, + "nativeDataType": "mixed", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "seen", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "servings", + "nullable": 
true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "sometimesNull", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "tags", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.ArrayType": {} + } + }, + "nativeDataType": "ARRAY", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "type", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mongodb", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "largeCollection", + "platform": "urn:li:dataPlatform:mongodb", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.BytesType": {} + } + }, + "nativeDataType": "oid", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_200", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_201", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_202", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_203", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_204", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + 
"fieldPath": "field_205", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_206", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_207", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_208", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_209", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_210", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_211", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_212", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_213", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_214", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_215", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_216", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_217", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_218", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_219", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_220", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_221", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_222", + "nullable": true, + "type": { + "type": { + 
"com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_223", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_224", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_225", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_226", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_227", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_228", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_229", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_230", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_231", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_232", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_233", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_234", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_235", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_236", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_237", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_238", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_239", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + 
"recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_240", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_241", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_242", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_243", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_244", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_245", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_246", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_247", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_248", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_249", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_250", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_251", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_252", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_253", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_254", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_255", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_256", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_257", + 
"nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_258", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_259", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_260", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_261", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_262", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_263", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_264", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_265", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_266", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_267", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_268", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_269", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_270", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_271", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_272", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_273", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_274", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + 
} + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_275", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_276", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_277", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_278", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_279", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_280", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_281", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_282", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_283", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_284", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_285", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_286", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_287", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_288", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_289", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_290", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_291", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false 
+ }, + { + "fieldPath": "field_292", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_293", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_294", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_295", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_296", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_297", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_298", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_299", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_300", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_301", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_302", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_303", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_304", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_305", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_306", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_307", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_308", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_309", + "nullable": true, + "type": { + "type": { + 
"com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_310", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_311", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_312", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_313", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_314", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_315", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_316", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_317", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_318", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_319", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_320", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_321", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_322", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_323", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_324", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_325", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_326", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + 
"recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_327", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_328", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_329", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_330", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_331", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_332", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_333", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_334", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_335", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_336", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_337", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_338", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_339", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_340", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_341", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_342", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_343", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_344", + 
"nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_345", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_346", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_347", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_348", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_349", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_350", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_351", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_352", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_353", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_354", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_355", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_356", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_357", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_358", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_359", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_360", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_361", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} 
+ } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_362", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_363", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_364", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_365", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_366", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_367", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_368", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_369", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_370", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_371", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_372", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_374", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_375", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_376", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_377", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_378", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_379", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": 
false + }, + { + "fieldPath": "field_380", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_381", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_382", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_383", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_384", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_385", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_386", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_387", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_388", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_389", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_390", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_391", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_392", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_393", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_394", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_395", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_396", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_397", + "nullable": true, + "type": { + 
"type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_398", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_399", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_400", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_401", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_402", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_403", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_404", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_405", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_406", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_407", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_408", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_409", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_410", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_411", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_412", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_413", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_414", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": 
"string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_415", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_416", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_417", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_418", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_419", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_420", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_421", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_422", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_423", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_424", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_425", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_426", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_427", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_428", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_429", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_430", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_431", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": 
"field_432", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_433", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_434", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_435", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_436", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_437", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_438", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_439", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_440", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_441", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_442", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_443", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_444", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_445", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_446", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_447", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_448", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_449", + "nullable": true, + "type": { + "type": { + 
"com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_450", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_451", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_452", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_453", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_454", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_455", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_456", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_457", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_458", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_459", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_460", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_461", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_462", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_463", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_464", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_465", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_466", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + 
"recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_467", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_468", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_469", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_470", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_471", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_472", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_473", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_474", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_475", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_476", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_477", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_478", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_479", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_480", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_481", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_482", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_483", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_484", + 
"nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_485", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_486", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_487", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_488", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_489", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_490", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_491", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_492", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_493", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_494", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_495", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_496", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_497", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_498", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "field_499", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mongodb-test" + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" } }, { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": 
"urn:li:dataset:(urn:li:dataPlatform:mongodb,mngdb.firstCollection,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": {}, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "firstCollection", - "platform": "urn:li:dataPlatform:mongodb", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.Schemaless": {} - }, - "fields": [ - { - "fieldPath": "_id", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BytesType": {} - } - }, - "nativeDataType": "oid", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "age", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "canSwim", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "boolean", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "emptyObject", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "OBJECT", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "favoriteColor", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "favoriteFood", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "OBJECT", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "favoriteFood.calories", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "integer", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "favoriteFood.emptyObject", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.RecordType": {} - } - }, - "nativeDataType": "OBJECT", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "favoriteFood.ingredients", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": {} - } - }, - "nativeDataType": "ARRAY", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "favoriteFood.ingredients.color", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "favoriteFood.ingredients.from", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "favoriteFood.ingredients.name", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "favoriteFood.name", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - 
"fieldPath": "favoriteFood.servings", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "legs", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "integer", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "mixedType", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.UnionType": {} - } - }, - "nativeDataType": "mixed", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "name", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "seen", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "servings", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "integer", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "sometimesNull", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "tags", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": {} - } - }, - "nativeDataType": "ARRAY", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "type", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - } - ] - } - } - ] + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "schema.downsampled": "True", + "schema.totalFields": "501" + }, + "tags": [] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mongodb-test" + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" } }, { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,mngdb.largeCollection,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": { - "schema.downsampled": "True", - "schema.totalFields": "501" - }, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "largeCollection", - "platform": "urn:li:dataPlatform:mongodb", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.Schemaless": {} - }, - "fields": [ - { - "fieldPath": "_id", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BytesType": {} - } - }, - "nativeDataType": "oid", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_200", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - 
"nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_201", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_202", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_203", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_204", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_205", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_206", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_207", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_208", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_209", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_210", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_211", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_212", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_213", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_214", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_215", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_216", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - 
{ - "fieldPath": "field_217", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_218", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_219", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_220", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_221", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_222", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_223", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_224", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_225", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_226", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_227", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_228", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_229", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_230", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_231", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_232", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_233", - "nullable": false, - "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_234", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_235", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_236", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_237", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_238", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_239", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_240", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_241", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_242", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_243", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_244", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_245", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_246", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_247", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_248", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_249", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": 
"string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_250", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_251", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_252", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_253", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_254", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_255", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_256", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_257", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_258", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_259", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_260", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_261", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_262", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_263", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_264", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_265", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": 
"field_266", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_267", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_268", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_269", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_270", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_271", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_272", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_273", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_274", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_275", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_276", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_277", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_278", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_279", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_280", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_281", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_282", - "nullable": true, - "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_283", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_284", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_285", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_286", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_287", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_288", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_289", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_290", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_291", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_292", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_293", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_294", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_295", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_296", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_297", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_298", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": 
"string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_299", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_300", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_301", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_302", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_303", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_304", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_305", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_306", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_307", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_308", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_309", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_310", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_311", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_312", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_313", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_314", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": 
"field_315", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_316", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_317", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_318", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_319", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_320", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_321", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_322", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_323", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_324", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_325", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_326", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_327", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_328", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_329", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_330", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_331", - "nullable": true, - "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_332", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_333", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_334", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_335", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_336", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_337", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_338", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_339", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_340", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_341", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_342", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_343", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_344", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_345", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_346", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_347", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": 
"string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_348", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_349", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_350", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_351", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_352", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_353", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_354", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_355", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_356", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_357", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_358", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_359", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_360", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_361", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_362", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_363", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - 
"fieldPath": "field_364", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_365", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_366", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_367", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_368", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_369", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_370", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_371", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_372", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_374", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_375", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_376", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_377", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_378", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_379", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_380", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_381", - "nullable": false, - "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_382", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_383", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_384", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_385", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_386", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_387", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_388", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_389", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_390", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_391", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_392", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_393", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_394", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_395", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_396", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_397", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - 
"nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_398", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_399", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_400", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_401", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_402", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_403", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_404", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_405", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_406", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_407", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_408", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_409", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_410", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_411", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_412", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_413", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, 
- { - "fieldPath": "field_414", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_415", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_416", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_417", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_418", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_419", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_420", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_421", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_422", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_423", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_424", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_425", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_426", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_427", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_428", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_429", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_430", - "nullable": false, - "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_431", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_432", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_433", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_434", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_435", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_436", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_437", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_438", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_439", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_440", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_441", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_442", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_443", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_444", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_445", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_446", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": 
"string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_447", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_448", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_449", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_450", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_451", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_452", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_453", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_454", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_455", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_456", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_457", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_458", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_459", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_460", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_461", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_462", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": 
"field_463", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_464", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_465", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_466", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_467", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_468", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_469", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_470", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_471", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_472", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_473", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_474", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_475", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_476", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_477", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_478", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_479", - "nullable": false, - "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_480", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_481", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_482", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_483", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_484", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_485", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_486", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_487", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_488", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_489", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_490", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_491", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_492", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_493", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_494", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_495", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": 
"string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_496", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_497", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_498", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "field_499", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - } - ] - } - } - ] + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mongodb", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mongodb-test" + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" } }, { - "proposedSnapshot": { - "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,mngdb.secondCollection,PROD)", - "aspects": [ - { - "com.linkedin.pegasus2avro.dataset.DatasetProperties": { - "customProperties": {}, - "tags": [] - } - }, - { - "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "secondCollection", - "platform": "urn:li:dataPlatform:mongodb", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.pegasus2avro.schema.Schemaless": {} - }, - "fields": [ - { - "fieldPath": "_id", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BytesType": {} - } - }, - "nativeDataType": "oid", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "mixedType", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.UnionType": {} - } - }, - "nativeDataType": "mixed", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "mixedType.fieldA", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "mixedType.fieldTwo", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "integer", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "name", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "string", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "nullableMixedType", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.UnionType": {} - } - }, - "nativeDataType": "mixed", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "rating", - "nullable": false, - "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "float", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "tasty", - "nullable": false, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.BooleanType": {} - } - }, - "nativeDataType": "boolean", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "varieties", - "nullable": true, - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.ArrayType": {} - } - }, - "nativeDataType": "ARRAY", - "recursive": false, - "isPartOfKey": false - } - ] - } + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "secondCollection", + "platform": "urn:li:dataPlatform:mongodb", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.Schemaless": {} + }, + "fields": [ + { + "fieldPath": "_id", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.BytesType": {} + } + }, + "nativeDataType": "oid", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "mixedType", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.UnionType": {} + } + }, + "nativeDataType": "mixed", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "mixedType.fieldA", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "mixedType.fieldTwo", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "integer", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "name", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "string", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "nullableMixedType", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.UnionType": {} + } + }, + "nativeDataType": "mixed", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "rating", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "float", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "tasty", + "nullable": false, + "type": { + "type": { + "com.linkedin.schema.BooleanType": {} + } + }, + "nativeDataType": "boolean", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "varieties", + "nullable": true, + "type": { + "type": { + "com.linkedin.schema.ArrayType": {} + } + }, + "nativeDataType": "ARRAY", + "recursive": false, + "isPartOfKey": false } ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mongodb-test" + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:mongodb", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:mongodb,instance)" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,mngdb.emptyCollection,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.emptyCollection,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -4145,12 +4249,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mongodb-test" + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,mngdb.firstCollection,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.firstCollection,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -4160,12 +4265,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mongodb-test" + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,mngdb.largeCollection,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.largeCollection,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -4175,12 +4281,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mongodb-test" + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,mngdb.secondCollection,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mongodb,instance.mngdb.secondCollection,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -4190,7 +4297,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mongodb-test" + "runId": "mongodb-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/mongodb/test_mongodb.py b/metadata-ingestion/tests/integration/mongodb/test_mongodb.py index 5228c21223e24b..56fb471d4c9f1b 100644 --- a/metadata-ingestion/tests/integration/mongodb/test_mongodb.py +++ b/metadata-ingestion/tests/integration/mongodb/test_mongodb.py @@ -25,6 +25,7 @@ def test_mongodb_ingest(docker_compose_runner, pytestconfig, tmp_path, mock_time "username": "mongoadmin", "password": "examplepass", "maxDocumentSize": 25000, + "platform_instance": "instance", }, }, "sink": { diff --git a/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json b/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json index 4aaefb48d33e15..38b03ce238d1c8 100644 --- a/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json +++ b/metadata-ingestion/tests/integration/mysql/mysql_mces_no_db_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -93,7 +98,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -213,7 +219,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -230,7 +237,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -250,7 +258,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -265,7 +274,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -361,7 +371,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -378,7 +389,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -398,7 +410,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -554,7 +567,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -969,7 +983,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -989,7 +1004,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1004,7 +1020,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1019,7 +1036,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1036,7 +1054,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1053,7 +1072,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1068,7 +1088,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1083,7 +1104,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1215,7 +1237,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1232,7 +1255,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": 
"mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1249,7 +1273,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1269,7 +1294,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1284,7 +1310,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1418,7 +1445,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1435,7 +1463,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1452,7 +1481,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1472,7 +1502,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1487,7 +1518,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1586,7 +1618,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1603,7 +1636,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1620,7 +1654,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1637,7 +1672,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1657,7 +1693,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1677,7 +1714,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1692,7 +1730,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1707,7 +1746,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1724,7 +1764,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1739,7 +1780,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1754,7 +1796,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1874,7 +1917,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1891,7 +1935,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + 
"lastRunId": "no-run-id-provided" } }, { @@ -1911,7 +1956,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1926,7 +1972,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2022,7 +2069,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2039,7 +2087,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2059,7 +2108,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2182,7 +2232,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2233,7 +2284,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2253,7 +2305,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2268,7 +2321,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2283,7 +2337,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2300,7 +2355,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2315,7 +2371,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2330,7 +2387,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2390,7 +2448,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2407,7 +2466,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2427,7 +2487,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2442,7 +2503,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2502,7 +2564,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2519,7 +2582,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2539,7 +2603,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2568,7 +2633,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -2597,7 +2663,79 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "mysql-test" + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),doubleVal)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),doubleVal)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),path)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),path)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index,PROD),urn)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:mysql,metagalaxy.metadata_index_view,PROD),urn)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "mysql-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json b/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json index 535ce964c6058f..b9b2a3b2141a8c 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/postgres/postgres_all_db_mces_with_db_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -99,7 +104,8 @@ }, "systemMetadata": { "lastObserved": 
1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +120,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +136,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -146,7 +154,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -161,7 +170,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -181,7 +191,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -201,7 +212,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -216,7 +228,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -231,7 +244,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -248,7 +262,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -263,7 +278,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -284,7 +300,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -299,7 +316,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -314,7 +332,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -331,7 +350,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -346,7 +366,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -366,7 +387,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -381,7 +403,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -537,7 +560,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -554,7 +578,186 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", + "changeType": "UPSERT", + 
"aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", + "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + }, + { + "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", + "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", + "is_view": "True" + }, + "name": "metadata_aspect_view", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "postgrestest.public.metadata_aspect_view", + "platform": "urn:li:dataPlatform:postgres", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "urn", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=500)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "aspect", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=200)", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", + "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + }, + { + "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", + "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -634,31 +837,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", - "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" - }, - { - "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", - "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -675,29 +855,39 @@ "actor": "urn:li:corpuser:unknown" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", - "type": "TRANSFORMED" + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" + ], + "confidenceScore": 1.0 } ] } }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", - "changeType": "UPSERT", - "aspectName": "status", - "aspect": { - "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/postgres/postgres_all_db_to_file_with_db_estimate_row_count.yml b/metadata-ingestion/tests/integration/postgres/postgres_all_db_to_file_with_db_estimate_row_count.yml index b390d9246677e9..2bfa39a65363b5 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_all_db_to_file_with_db_estimate_row_count.yml +++ b/metadata-ingestion/tests/integration/postgres/postgres_all_db_to_file_with_db_estimate_row_count.yml @@ -25,7 +25,7 @@ source: include_field_distinct_value_frequencies: false include_field_histogram: false catch_exceptions: true - include_views: false + include_views: true sink: type: file 
config: diff --git a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json index bf36a39a8c103f..f6fa0a0ed032ef 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/postgres/postgres_mces_with_db_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -99,7 +104,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +120,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +136,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -146,7 +154,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -161,7 +170,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -181,7 +191,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -196,7 +207,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -352,7 +364,8 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -369,7 +382,186 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", + "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + }, + { + "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", + "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "view_definition": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", + "is_view": "True" + }, + "name": "metadata_aspect_view", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "postgrestest.public.metadata_aspect_view", + "platform": "urn:li:dataPlatform:postgres", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "urn", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=500)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "aspect", + "nullable": true, + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=200)", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": " SELECT metadata_aspect_v2.urn,\n metadata_aspect_v2.aspect\n FROM metadata_aspect_v2\n WHERE (metadata_aspect_v2.version = 0);", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", + "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + }, + { + "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", + "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 
1646575200000, + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { @@ -391,31 +583,57 @@ }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "upstreamLineage", "aspect": { "json": { - "path": [ + "upstreams": [ { - "id": "urn:li:container:a6097853edba03be190d99ece4b307ff", - "urn": "urn:li:container:a6097853edba03be190d99ece4b307ff" + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),aspect)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),aspect)" + ], + "confidenceScore": 1.0 }, { - "id": "urn:li:container:51904fc8cd5cc729bc630decff284525", - "urn": "urn:li:container:51904fc8cd5cc729bc630decff284525" + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_v2,PROD),urn)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:postgres,postgrestest.public.metadata_aspect_view,PROD),urn)" + ], + "confidenceScore": 1.0 } ] } }, "systemMetadata": { "lastObserved": 1646575200000, - "runId": "postgres-test" + "runId": "postgres-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml b/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml index a489877d52a23f..4a2cc543f2d011 100644 --- a/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml +++ b/metadata-ingestion/tests/integration/postgres/postgres_to_file_with_db_estimate_row_count.yml @@ -13,7 +13,7 @@ source: profile_table_row_count_estimate_only: true turn_off_expensive_profiling_metrics: true catch_exceptions: true - include_views: false + include_views: true sink: type: file config: diff --git a/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json b/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json new file mode 100644 index 00000000000000..5f92cdcfb5bde9 --- /dev/null +++ b/metadata-ingestion/tests/integration/powerbi/golden_test_cll.json @@ -0,0 +1,1357 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "dummy", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "public issue_history", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Snowflake.Databases(\"hp123rt5.ap-southeast-2.fakecomputing.com\",\"PBI_TEST_WAREHOUSE_PROD\",[Role=\"PBI_TEST_MEMBER\"]),\n PBI_TEST_Database = Source{[Name=\"PBI_TEST\",Kind=\"Database\"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name=\"TEST\",Kind=\"Schema\"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name=\"TESTTABLE\",Kind=\"Table\"]}[Data]\nin\n TESTTABLE_Table", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "SNOWFLAKE_TESTTABLE", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)", + "changeType": "UPSERT", + 
"aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,PBI_TEST.TEST.TESTTABLE,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu20658.ap-southeast-2.snowflakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"SELECT#(lf)concat((UPPER(REPLACE(SELLER,'-',''))), MONTHID) as AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTHID) as CD_AGENT_KEY,#(lf) *#(lf)FROM#(lf)OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4\", null, [EnableFolding=true]),\n #\"Added Conditional Column\" = Table.AddColumn(Source, \"SME Units ENT\", each if [DEAL_TYPE] = \"SME Unit\" then [UNIT] else 0),\n #\"Added Conditional Column1\" = Table.AddColumn(#\"Added Conditional Column\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" then [UNIT] else 0),\n #\"Removed Columns\" = Table.RemoveColumns(#\"Added Conditional Column1\",{\"Banklink Units\"}),\n #\"Added Custom\" = Table.AddColumn(#\"Removed Columns\", \"Banklink Units\", each if [DEAL_TYPE] = \"Banklink\" and [SALES_TYPE] = \"3 - Upsell\"\nthen [UNIT]\n\nelse if [SALES_TYPE] = \"Adjusted BL Migration\"\nthen [UNIT]\n\nelse 0),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"SME Units in $ (*$361)\", each if [DEAL_TYPE] = \"SME Unit\" \nand [SALES_TYPE] <> \"4 - Renewal\"\n then [UNIT] * 361\nelse 0),\n #\"Added Custom2\" = Table.AddColumn(#\"Added Custom1\", \"Banklink in $ (*$148)\", each [Banklink Units] * 148)\nin\n #\"Added Custom2\"", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "snowflake native-query", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ 
+ "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD)", + "type": "TRANSFORMED" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD),monthid)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD),seller)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV),agent_key)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD),client_director)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD),monthid)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV),cd_agent_key)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = GoogleBigQuery.Database([BillingProject = #\"Parameter - Source\"]),\n#\"gcp-project\" = Source{[Name=#\"Parameter - Source\"]}[Data],\nuniversal_Schema = #\"gcp-project\"{[Name=\"universal\",Kind=\"Schema\"]}[Data],\nD_WH_DATE_Table = universal_Schema{[Name=\"D_WH_DATE\",Kind=\"Table\"]}[Data],\n#\"Filtered Rows\" = Table.SelectRows(D_WH_DATE_Table, each [D_DATE] > #datetime(2019, 9, 10, 0, 0, 0)),\n#\"Filtered Rows1\" = Table.SelectRows(#\"Filtered Rows\", each DateTime.IsInPreviousNHours([D_DATE], 87600))\n in \n#\"Filtered Rows1\"", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "big-query-with-parameter", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + 
"systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Value.NativeQuery(Snowflake.Databases(\"xaa48144.snowflakecomputing.com\",\"GSL_TEST_WH\",[Role=\"ACCOUNTADMIN\"]){[Name=\"GSL_TEST_DB\"]}[Data], \"select A.name from GSL_TEST_DB.PUBLIC.SALES_ANALYST as A inner join GSL_TEST_DB.PUBLIC.SALES_FORECAST as B on A.name = B.name where startswith(A.name, 'mo')\", null, [EnableFolding=true])\nin\n Source", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "snowflake native-query-with-join", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.universal.D_WH_DATE,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Oracle.Database(\"localhost:1521/salesdb.GSLAB.COM\", [HierarchicalNavigation=true]), HR = Source{[Schema=\"HR\"]}[Data], 
EMPLOYEES1 = HR{[Name=\"EMPLOYEES\"]}[Data] \n in EMPLOYEES1", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_forecast,PROD)", + "type": "TRANSFORMED" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV),name)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD),name)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV),name)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "job-history", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.HR.EMPLOYEES,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": 
"powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = PostgreSQL.Database(\"localhost\" , \"mics\" ),\n public_order_date = Source{[Schema=\"public\",Item=\"order_date\"]}[Data] \n in \n public_order_date", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details", + "name": "postgres_test_table", + "description": "Library dataset description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Sql.Database(\"localhost\", \"library\"),\n dbo_book_issue = Source{[Schema=\"dbo\",Item=\"book_issue\"]}[Data]\n in dbo_book_issue", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details", + "name": "dbo_book_issue", + "description": "hr pbi test description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + 
"runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "let\n Source = Sql.Database(\"AUPRDWHDB\", \"COMMOPSDB\", [Query=\"select *,#(lf)concat((UPPER(REPLACE(CLIENT_DIRECTOR,'-',''))), MONTH_WID) as CD_AGENT_KEY,#(lf)concat((UPPER(REPLACE(CLIENT_MANAGER_CLOSING_MONTH,'-',''))), MONTH_WID) as AGENT_KEY#(lf)#(lf)from V_PS_CD_RETENTION\", CommandTimeout=#duration(0, 1, 30, 0)]),\n #\"Changed Type\" = Table.TransformColumnTypes(Source,{{\"mth_date\", type date}}),\n #\"Added Custom\" = Table.AddColumn(#\"Changed Type\", \"Month\", each Date.Month([mth_date])),\n #\"Added Custom1\" = Table.AddColumn(#\"Added Custom\", \"TPV Opening\", each if [Month] = 1 then [TPV_AMV_OPENING]\nelse if [Month] = 2 then 0\nelse if [Month] = 3 then 0\nelse if [Month] = 4 then [TPV_AMV_OPENING]\nelse if [Month] = 5 then 0\nelse if [Month] = 6 then 0\nelse if [Month] = 7 then [TPV_AMV_OPENING]\nelse if [Month] = 8 then 0\nelse if [Month] = 9 then 0\nelse if [Month] = 10 then [TPV_AMV_OPENING]\nelse if [Month] = 11 then 0\nelse if [Month] = 12 then 0\n\nelse 0)\nin\n #\"Added Custom1\"", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details", + "name": "ms_sql_native_table", + "description": "hr pbi test description", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:mssql,library.dbo.book_issue,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + 
"runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "json": { + "username": "User1@foo.com" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "corpUserKey", + "aspect": { + "json": { + "username": "User2@foo.com" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "json": { + "customProperties": { + "createdFrom": "Dataset", + "datasetId": "05169CD2-E713-41E6-9600-1D8066D95445", + "datasetWebUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/05169CD2-E713-41E6-9600-1D8066D95445/details" + }, + "title": "test_tile", + "description": "test_tile", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.public_issue_history,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.SNOWFLAKE_TESTTABLE,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.big-query-with-parameter,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.snowflake_native-query-with-join,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.job-history,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,library-dataset.postgres_test_table,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "chartId": "powerbi.linkedin.com/charts/B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + 
"runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "demo-workspace" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartInfo", + "aspect": { + "json": { + "customProperties": { + "createdFrom": "Dataset", + "datasetId": "ba0130a1-5b03-40de-9535-b34e778ea6ed", + "datasetWebUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/ba0130a1-5b03-40de-9535-b34e778ea6ed/details" + }, + "title": "yearly_sales", + "description": "yearly_sales", + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "inputs": [ + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.dbo_book_issue,DEV)" + }, + { + "string": "urn:li:dataset:(urn:li:dataPlatform:powerbi,hr_pbi_test.ms_sql_native_table,DEV)" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "chartKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "chartId": "powerbi.linkedin.com/charts/23212598-23b5-4980-87cc-5fc0ecd84385" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "chart", + "entityUrn": "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "demo-workspace" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePaths", + "aspect": { + "json": { + "paths": [ + "/powerbi/demo-workspace" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardInfo", + "aspect": { + "json": { + "customProperties": { + "chartCount": "2", + "workspaceName": "demo-workspace", + "workspaceId": "64ED5CAD-7C10-4684-8180-826122881108" + }, + "title": "test_dashboard", + "description": "Description of test dashboard", + "charts": 
[ + "urn:li:chart:(powerbi,charts.B8E293DC-0C83-4AA0-9BB9-0A8738DF24A0)", + "urn:li:chart:(powerbi,charts.23212598-23b5-4980-87cc-5fc0ecd84385)" + ], + "datasets": [], + "lastModified": { + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + }, + "dashboardUrl": "https://localhost/dashboards/web/1" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "dashboardKey", + "aspect": { + "json": { + "dashboardTool": "powerbi", + "dashboardId": "powerbi.linkedin.com/dashboards/7D668CAD-7FFC-4505-9215-655BCA5BEBAE" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:users.User1@foo.com", + "type": "NONE" + }, + { + "owner": "urn:li:corpuser:users.User2@foo.com", + "type": "NONE" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dashboard", + "entityUrn": "urn:li:dashboard:(powerbi,dashboards.7D668CAD-7FFC-4505-9215-655BCA5BEBAE)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "demo-workspace" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,employee-dataset.employee_ctc,DEV)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "dummy", + "viewLanguage": "m_query" + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User1@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,employee-dataset.employee_ctc,DEV)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:powerbi,employee-dataset.employee_ctc,DEV)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "PowerBI Dataset Table", + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:powerbi,employee-dataset.employee_ctc,DEV)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": { + "datasetId": "91580e0e-1680-4b1c-bbf9-4f6764d7a5ff" + }, + "externalUrl": "http://localhost/groups/64ED5CAD-7C10-4684-8180-826122881108/datasets/91580e0e-1680-4b1c-bbf9-4f6764d7a5ff/details", + "name": "employee_ctc", + "description": "Employee Management", + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +}, +{ + "entityType": "corpuser", + "entityUrn": "urn:li:corpuser:users.User2@foo.com", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "powerbi-test" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py index f95fd81681a9a7..6f45dcf97f1dd7 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py +++ b/metadata-ingestion/tests/integration/powerbi/test_admin_only_api.py @@ -3,11 +3,14 @@ from typing import Any, Dict from unittest import mock +import pytest from freezegun import freeze_time from datahub.ingestion.run.pipeline import Pipeline from tests.test_helpers import mce_helpers +pytestmark = pytest.mark.integration_batch_2 + FROZEN_TIME = "2022-02-03 07:00:00" diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 2fcbf5a0c0860e..b6cb578217a2c2 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -15,10 +15,10 @@ AbstractDataPlatformInstanceResolver, create_dataplatform_instance_resolver, ) -from datahub.ingestion.source.powerbi.m_query import parser, tree_function -from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable +from datahub.ingestion.source.powerbi.m_query import parser, resolver, tree_function +from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable, Lineage -pytestmark = pytest.mark.slow +pytestmark = pytest.mark.integration_batch_2 M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -70,6 +70,15 @@ def get_default_instances( return PipelineContext(run_id="fake"), config, platform_instance_resolver +def combine_upstreams_from_lineage(lineage: List[Lineage]) -> List[DataPlatformTable]: + data_platforms: List[DataPlatformTable] = [] + + for item in lineage: + data_platforms.extend(item.upstreams) + + return data_platforms + + @pytest.mark.integration def test_parse_m_query1(): expression: str = M_QUERIES[0] @@ -182,7 +191,7 @@ def test_snowflake_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -212,7 +221,7 @@ def test_postgres_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 
assert ( @@ -242,7 +251,7 @@ def test_databricks_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -272,7 +281,7 @@ def test_oracle_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -302,7 +311,7 @@ def test_mssql_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -348,7 +357,7 @@ def test_mssql_with_query(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert data_platform_tables[0].urn == expected_tables[index] @@ -388,7 +397,7 @@ def test_snowflake_native_query(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert data_platform_tables[0].urn == expected_tables[index] @@ -410,7 +419,7 @@ def test_google_bigquery_1(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -442,7 +451,7 @@ def test_google_bigquery_2(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -472,7 +481,7 @@ def test_for_each_expression_1(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -501,7 +510,7 @@ def test_for_each_expression_2(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -523,15 +532,15 @@ def test_native_query_disabled(): reporter = PowerBiDashboardSourceReport() ctx, config, platform_instance_resolver = get_default_instances() - config.native_query_parsing = False - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + config.native_query_parsing = False # Disable native query parsing + lineage: List[Lineage] = parser.get_upstream_tables( table, reporter, ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, ) - assert len(data_platform_tables) == 0 + assert len(lineage) == 0 @pytest.mark.integration @@ -548,12 +557,14 @@ def test_multi_source_table(): ctx, config, platform_instance_resolver = get_default_instances() - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, - reporter, - ctx=ctx, - config=config, - platform_instance_resolver=platform_instance_resolver, + data_platform_tables: List[DataPlatformTable] = combine_upstreams_from_lineage( + parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) assert len(data_platform_tables) == 2 @@ -581,12 +592,14 @@ def test_table_combine(): ctx, config, platform_instance_resolver = get_default_instances() - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, - reporter, - ctx=ctx, - config=config, - platform_instance_resolver=platform_instance_resolver, + data_platform_tables: List[DataPlatformTable] = combine_upstreams_from_lineage( + parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + 
platform_instance_resolver=platform_instance_resolver, + ) ) assert len(data_platform_tables) == 2 @@ -624,7 +637,7 @@ def test_expression_is_none(): ctx, config, platform_instance_resolver = get_default_instances() - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + lineage: List[Lineage] = parser.get_upstream_tables( table, reporter, ctx=ctx, @@ -632,7 +645,7 @@ def test_expression_is_none(): platform_instance_resolver=platform_instance_resolver, ) - assert len(data_platform_tables) == 0 + assert len(lineage) == 0 def test_redshift_regular_case(): @@ -651,7 +664,7 @@ def test_redshift_regular_case(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -678,7 +691,7 @@ def test_redshift_native_query(): ctx=ctx, config=config, platform_instance_resolver=platform_instance_resolver, - ) + )[0].upstreams assert len(data_platform_tables) == 1 assert ( @@ -708,7 +721,7 @@ def test_sqlglot_parser(): } ) - data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + lineage: List[resolver.Lineage] = parser.get_upstream_tables( table, reporter, ctx=ctx, @@ -716,6 +729,8 @@ def test_sqlglot_parser(): platform_instance_resolver=platform_instance_resolver, ) + data_platform_tables: List[DataPlatformTable] = lineage[0].upstreams + assert len(data_platform_tables) == 2 assert ( data_platform_tables[0].urn @@ -725,3 +740,26 @@ def test_sqlglot_parser(): data_platform_tables[1].urn == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit_targets,PROD)" ) + + # TODO: None of these columns have upstreams? + # That doesn't seem right - we probably need to add fake schemas for the two tables above. + cols = [ + "client_director", + "tier", + 'upper("manager")', + "team_type", + "date_target", + "monthid", + "target_team", + "seller_email", + "agent_key", + "sme_quota", + "revenue_quota", + "service_quota", + "bl_target", + "software_quota", + ] + for i, column in enumerate(cols): + assert lineage[0].column_lineage[i].downstream.table is None + assert lineage[0].column_lineage[i].downstream.column == column + assert lineage[0].column_lineage[i].upstreams == [] diff --git a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py index 044532021a19c3..7232d2a38da1d3 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_powerbi.py +++ b/metadata-ingestion/tests/integration/powerbi/test_powerbi.py @@ -1,4 +1,5 @@ import logging +import re import sys from typing import Any, Dict, List, cast from unittest import mock @@ -20,7 +21,7 @@ ) from tests.test_helpers import mce_helpers -pytestmark = pytest.mark.slow +pytestmark = pytest.mark.integration_batch_2 FROZEN_TIME = "2022-02-03 07:00:00" @@ -1127,7 +1128,7 @@ def test_dataset_type_mapping_error( """ register_mock_api(request_mock=requests_mock) - try: + with pytest.raises(Exception, match=r"dataset_type_mapping is deprecated"): Pipeline.create( { "run_id": "powerbi-test", @@ -1150,11 +1151,6 @@ def test_dataset_type_mapping_error( }, } ) - except Exception as e: - assert ( - "dataset_type_mapping is deprecated. Use server_to_platform_instance only." 
- in str(e) - ) @freeze_time(FROZEN_TIME) @@ -1506,3 +1502,90 @@ def test_independent_datasets_extraction( output_path=tmp_path / "powerbi_independent_mces.json", golden_path=f"{test_resources_dir}/{golden_file}", ) + + +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +def test_cll_extraction(mock_msal, pytestconfig, tmp_path, mock_time, requests_mock): + + test_resources_dir = pytestconfig.rootpath / "tests/integration/powerbi" + + register_mock_api( + request_mock=requests_mock, + ) + + default_conf: dict = default_source_config() + + del default_conf[ + "dataset_type_mapping" + ] # delete this key so that connector set it to default (all dataplatform) + + pipeline = Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_conf, + "extract_lineage": True, + "extract_column_level_lineage": True, + "enable_advance_lineage_sql_construct": True, + "native_query_parsing": True, + "extract_independent_datasets": True, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_cll_mces.json", + }, + }, + } + ) + + pipeline.run() + pipeline.raise_from_status() + golden_file = "golden_test_cll.json" + + mce_helpers.check_golden_file( + pytestconfig, + output_path=tmp_path / "powerbi_cll_mces.json", + golden_path=f"{test_resources_dir}/{golden_file}", + ) + + +@freeze_time(FROZEN_TIME) +@mock.patch("msal.ConfidentialClientApplication", side_effect=mock_msal_cca) +def test_cll_extraction_flags( + mock_msal, pytestconfig, tmp_path, mock_time, requests_mock +): + + register_mock_api( + request_mock=requests_mock, + ) + + default_conf: dict = default_source_config() + pattern: str = re.escape( + "Enable all these flags in recipe: ['native_query_parsing', 'enable_advance_lineage_sql_construct', 'extract_lineage']" + ) + + with pytest.raises(Exception, match=pattern): + + Pipeline.create( + { + "run_id": "powerbi-test", + "source": { + "type": "powerbi", + "config": { + **default_conf, + "extract_column_level_lineage": True, + }, + }, + "sink": { + "type": "file", + "config": { + "filename": f"{tmp_path}/powerbi_cll_mces.json", + }, + }, + } + ) diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 81e307a78ae9e6..ff448eca01071f 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -94,7 +94,7 @@ def default_query_results( # noqa: C901 "name": "VIEW_{}".format(view_idx), "created_on": datetime(2021, 6, 8, 0, 0, 0, 0), "comment": "Comment for View", - "text": None, + "text": f"create view view_{view_idx} as select * from table_{view_idx}", } for view_idx in range(1, num_views + 1) ] diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json index a424b258e68ff0..c7273fee5a2e58 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_golden.json @@ -24,7 +24,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -39,7 +40,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": 
"no-run-id-provided" } }, { @@ -54,7 +56,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -71,7 +74,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -90,7 +94,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -105,7 +110,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -121,7 +127,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -150,7 +157,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -165,7 +173,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -180,7 +189,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -197,7 +207,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -216,7 +227,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -231,7 +243,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -251,7 +264,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -266,7 +280,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -462,7 +477,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -488,7 +504,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -503,7 +520,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -520,7 +538,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -544,7 +563,8 @@ }, "systemMetadata": { "lastObserved": 
1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -559,7 +579,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -755,7 +776,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -781,7 +803,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -796,7 +819,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -813,7 +837,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -837,7 +862,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -852,7 +878,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1048,7 +1075,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1074,7 +1102,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -1089,7 +1118,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1106,7 +1136,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1130,7 +1161,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1145,7 +1177,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1341,7 +1374,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1367,7 +1401,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -1382,7 +1417,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1399,7 +1435,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + 
"runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1423,7 +1460,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1438,7 +1476,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1634,7 +1673,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1660,7 +1700,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -1675,7 +1716,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1692,7 +1734,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1716,7 +1759,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1731,7 +1775,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1927,7 +1972,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1953,7 +1999,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -1968,7 +2015,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1985,7 +2033,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2009,7 +2058,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2024,7 +2074,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2220,7 +2271,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2246,7 +2298,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -2261,7 +2314,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + 
"lastRunId": "no-run-id-provided" } }, { @@ -2278,7 +2332,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2302,7 +2357,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2317,7 +2373,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2513,7 +2570,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2539,7 +2597,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -2554,7 +2613,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2571,7 +2631,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2595,7 +2656,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2610,7 +2672,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2806,7 +2869,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2832,7 +2896,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -2847,7 +2912,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2864,7 +2930,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2888,7 +2955,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2903,7 +2971,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3099,7 +3168,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3125,7 +3195,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -3140,7 
+3211,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3157,7 +3229,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3181,7 +3254,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3197,7 +3271,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3212,7 +3287,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3382,7 +3458,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3408,7 +3485,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -3423,7 +3501,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3440,7 +3519,26 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "create view view_1 as select * from table_1", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "snowflake-2023_10_06-17_59_03", + "lastRunId": "no-run-id-provided" } }, { @@ -3464,7 +3562,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3480,7 +3579,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3496,7 +3596,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3512,7 +3613,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3527,7 +3629,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3690,7 +3793,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, 
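Context for the golden entries that follow: `common.py` now returns a real `create view …` statement in the `text` column instead of `None`, so the connector can parse view definitions and emit `viewProperties` plus `upstreamLineage` aspects carrying per-column `fineGrainedLineages`. A hedged sketch of how an aspect shaped like those entries can be assembled with the DataHub SDK — class names are from `datahub.metadata.schema_classes`, but treat this as an illustration of the aspect shape, not the connector's actual code path:

```python
# Hedged sketch: build an upstreamLineage aspect matching the golden entries
# below (a VIEW-type table upstream plus 1:1 column mappings at confidence 1.0).
from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn
from datahub.metadata.schema_classes import (
    DatasetLineageTypeClass,
    FineGrainedLineageClass,
    FineGrainedLineageDownstreamTypeClass,
    FineGrainedLineageUpstreamTypeClass,
    UpstreamClass,
    UpstreamLineageClass,
)

table_urn = make_dataset_urn("snowflake", "test_db.test_schema.table_1", "PROD")
view_urn = make_dataset_urn("snowflake", "test_db.test_schema.view_1", "PROD")

# A `select *` view maps each column straight through: one FIELD_SET -> FIELD
# entry per column, which is exactly the pattern in the golden file.
columns = [f"col_{i}" for i in range(1, 11)]

aspect = UpstreamLineageClass(
    upstreams=[UpstreamClass(dataset=table_urn, type=DatasetLineageTypeClass.VIEW)],
    fineGrainedLineages=[
        FineGrainedLineageClass(
            upstreamType=FineGrainedLineageUpstreamTypeClass.FIELD_SET,
            upstreams=[make_schema_field_urn(table_urn, col)],
            downstreamType=FineGrainedLineageDownstreamTypeClass.FIELD,
            downstreams=[make_schema_field_urn(view_urn, col)],
            confidenceScore=1.0,
        )
        for col in columns
    ],
)
```

The `"lastRunId": "no-run-id-provided"` additions that dominate this file's diff are mechanical by comparison: `systemMetadata` gained a new field, so every golden record is re-serialized with its default value.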
{ @@ -3716,7 +3820,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "snowflake-2023_08_04-09_52_28" + "runId": "snowflake-2023_08_04-09_52_28", + "lastRunId": "no-run-id-provided" } }, { @@ -3731,7 +3836,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3748,7 +3854,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3773,7 +3880,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3797,7 +3905,26 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "create view view_2 as select * from table_2", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "snowflake-2023_10_06-17_59_03", + "lastRunId": "no-run-id-provided" } }, { @@ -3819,7 +3946,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3841,7 +3969,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3863,7 +3992,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3885,7 +4015,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3907,7 +4038,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3929,7 +4061,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3951,7 +4084,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3973,7 +4107,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3995,7 +4130,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -4017,7 +4153,145 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_1)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_10)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_10)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_2)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_2)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_3)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_3)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_4)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_4)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_5)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_5)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_6)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_6)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_7)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_7)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_8)" + ], + 
"downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_8)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD),col_9)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_1,PROD),col_9)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "snowflake-2023_10_06-17_59_03", + "lastRunId": "no-run-id-provided" } }, { @@ -4034,14 +4308,127 @@ "actor": "urn:li:corpuser:unknown" }, "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_1)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_10)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_10)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_2)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_2)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_3)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_3)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_4)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_4)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_5)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_5)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_6)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_6)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_7)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_7)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_8)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_8)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD),col_9)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.view_2,PROD),col_9)" + ], + "confidenceScore": 1.0 } ] } }, "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "lastObserved": 1615443388097, + "runId": "snowflake-2023_10_06-17_59_03", + "lastRunId": "no-run-id-provided" } }, { @@ -4204,7 +4591,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -4340,7 +4728,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -4476,7 +4865,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -4612,7 +5002,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -4748,7 +5139,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -4884,7 +5276,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5020,7 +5413,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5156,7 +5550,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5292,7 +5687,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5428,7 +5824,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5456,7 +5853,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5484,7 +5882,8 @@ }, 
"systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5512,7 +5911,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5540,7 +5940,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5568,7 +5969,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5596,7 +5998,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5624,7 +6027,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5652,7 +6056,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5680,7 +6085,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5708,7 +6114,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5736,7 +6143,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5764,7 +6172,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5786,7 +6195,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5808,7 +6218,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5830,7 +6241,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5852,7 +6264,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5874,7 +6287,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5896,7 +6310,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5918,7 +6333,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - 
"runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5940,7 +6356,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5962,7 +6379,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5984,7 +6402,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -5999,7 +6418,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6014,7 +6434,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6029,7 +6450,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6044,7 +6466,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6059,7 +6482,8 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json index 7687b99ac8d6d6..5e55860483d241 100644 --- a/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json +++ b/metadata-ingestion/tests/integration/snowflake/snowflake_privatelink_golden.json @@ -1,13 +1,14 @@ [ { "entityType": "container", - "entityUrn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", + "entityUrn": "urn:li:container:900b1327253068cb1537b1b3c807ddab", "changeType": "UPSERT", "aspectName": "containerProperties", "aspect": { "json": { "customProperties": { "platform": "snowflake", + "instance": "instance1", "env": "PROD", "database": "test_db" }, @@ -24,205 +25,228 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_3,PROD)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "removed": false + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": 
"urn:li:container:5e359958be02ce647cd9ac196dbd4585", + "entityUrn": "urn:li:container:900b1327253068cb1537b1b3c807ddab", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + "aspectName": "status", "aspect": { "json": { - "platform": "urn:li:dataPlatform:snowflake" + "removed": false } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_3,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "status", "aspect": { "json": { - "typeNames": [ - "Database" - ] + "removed": false } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", + "entityUrn": "urn:li:container:900b1327253068cb1537b1b3c807ddab", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "subTypes", "aspect": { "json": { - "path": [] + "typeNames": [ + "Database" + ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", + "entityUrn": "urn:li:container:900b1327253068cb1537b1b3c807ddab", "changeType": "UPSERT", - "aspectName": "containerProperties", + "aspectName": "browsePathsV2", "aspect": { "json": { - "customProperties": { - "platform": "snowflake", - "env": "PROD", - "database": "test_db", - "schema": "test_schema" - }, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/", - "name": "TEST_SCHEMA", - "description": "comment for TEST_DB.TEST_SCHEMA", - "created": { - "time": 1623110400000 - }, - "lastModified": { - "time": 1623110400000 - } + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + } + ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "container", - "entityUrn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", + "entityUrn": "urn:li:container:900b1327253068cb1537b1b3c807ddab", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "removed": false + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_3,PROD)", "changeType": "UPSERT", - "aspectName": "dataPlatformInstance", + 
"aspectName": "datasetProperties", "aspect": { "json": { - "platform": "urn:li:dataPlatform:snowflake" + "customProperties": {}, + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/", + "name": "TABLE_3", + "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_3", + "description": "Comment for Table", + "created": { + "time": 1623110400000 + }, + "lastModified": { + "time": 1623110400000 + }, + "tags": [] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_3,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Schema" + "Table" ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_3,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" + }, + { + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "containerProperties", "aspect": { "json": { - "removed": false + "customProperties": { + "platform": "snowflake", + "instance": "instance1", + "env": "PROD", + "database": "test_db", + "schema": "test_schema" + }, + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/", + "name": "TEST_SCHEMA", + "description": "comment for TEST_DB.TEST_SCHEMA", + "created": { + "time": 1623110400000 + }, + "lastModified": { + "time": 1623110400000 + } } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": 
"snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_3,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "test_db.test_schema.table_1", + "schemaName": "test_db.test_schema.table_3", "platform": "urn:li:dataPlatform:snowflake", "version": 0, "created": { @@ -375,94 +399,199 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "status", "aspect": { "json": { - "customProperties": {}, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/", - "name": "TABLE_1", - "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_1", - "description": "Comment for Table", - "created": { - "time": 1623110400000 - }, - "lastModified": { - "time": 1623110400000 - }, - "tags": [] + "removed": false } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_3,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Table" + "Schema" ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:900b1327253068cb1537b1b3c807ddab" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": 
"urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" }, { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "create view view_1 as select * from table_1", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_1/", + "name": "VIEW_1", + "qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_1", + "description": "Comment for View", + "created": { + "time": 1623110400000 + }, + "lastModified": { + "time": 1623110400000 + }, + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -472,17 +601,18 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "test_db.test_schema.table_2", + "schemaName": "test_db.test_schema.table_1", "platform": "urn:li:dataPlatform:snowflake", "version": 0, "created": { @@ -635,114 +765,108 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)", "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "subTypes", "aspect": { "json": { - "customProperties": {}, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/", - "name": "TABLE_2", - "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_2", - "description": "Comment for Table", - "created": { - "time": 1623110400000 - }, - "lastModified": { - "time": 1623110400000 - }, - "tags": [] + "typeNames": [ + "View" + ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "datasetProperties", "aspect": { "json": { - "typeNames": [ - "Table" - ] + "customProperties": {}, + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_1/", + "name": "TABLE_1", + "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_1", + "description": "Comment for Table", + "created": { + "time": 1623110400000 + }, + "lastModified": { + "time": 1623110400000 + }, + "tags": [] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } 
}, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" }, { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "schemaMetadata", "aspect": { "json": { - "removed": false - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)", - "changeType": "UPSERT", - "aspectName": "schemaMetadata", - "aspect": { - "json": { - "schemaName": "test_db.test_schema.table_3", + "schemaName": "test_db.test_schema.view_1", "platform": "urn:li:dataPlatform:snowflake", "version": 0, "created": { @@ -895,94 +1019,263 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)", "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "subTypes", "aspect": { "json": { - "customProperties": {}, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_3/", - "name": "TABLE_3", - "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_3", - "description": "Comment for Table", - "created": { - "time": 1623110400000 - }, - "lastModified": { - "time": 1623110400000 - }, - "tags": [] + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, 
+ { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" + }, + { + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "typeNames": [ - "Table" - ] + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "upstreamLineage", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" - }, + "upstreams": [ { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_1)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_10)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_10)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_2)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_2)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_3)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_3)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_4)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_4)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_5)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_5)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_6)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_6)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_7)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_7)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_8)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_8)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_1,PROD),col_9)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_1,PROD),col_9)" + ], + "confidenceScore": 1.0 } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_10,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + } + }, + 
"systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_10,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -992,17 +1285,18 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "test_db.test_schema.table_4", + "schemaName": "test_db.test_schema.table_2", "platform": "urn:li:dataPlatform:snowflake", "version": 0, "created": { @@ -1155,20 +1449,21 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_10,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/", - "name": "TABLE_4", - "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_4", + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/", + "name": "TABLE_10", + "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_10", "description": "Comment for Table", "created": { "time": 1623110400000 @@ -1181,68 +1476,13 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" - }, - { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_5,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1252,12 +1492,13 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_5,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { @@ -1415,12 +1656,45 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_5,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_5,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { @@ -1441,27 +1715,58 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)", "changeType": "UPSERT", - "aspectName": "container", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_2/", + 
"name": "TABLE_2", + "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_2", + "description": "Comment for Table", + "created": { + "time": 1623110400000 + }, + "lastModified": { + "time": 1623110400000 + }, + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_10,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", "aspect": { "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "typeNames": [ + "Table" + ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_5,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1473,75 +1778,159 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_5,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" }, { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_10,PROD)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "browsePathsV2", "aspect": { "json": { - "removed": false + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" + }, + { + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", 
- "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_5,PROD)", "changeType": "UPSERT", - "aspectName": "schemaMetadata", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "schemaName": "test_db.test_schema.table_6", "platform": "urn:li:dataPlatform:snowflake", - "version": 0, - "created": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "lastModified": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "hash": "", - "platformSchema": { - "com.linkedin.schema.MySqlDDL": { - "tableSchema": "" - } - }, - "fields": [ + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ { - "fieldPath": "col_1", + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" + }, + { + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_10,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "test_db.test_schema.table_10", + "platform": "urn:li:dataPlatform:snowflake", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "col_1", "nullable": false, "description": "Comment for column", "type": { @@ -1675,94 +2064,13 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)", - "changeType": "UPSERT", - "aspectName": "datasetProperties", - "aspect": { - "json": { - "customProperties": {}, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/", - "name": "TABLE_6", - "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_6", - "description": "Comment for Table", - "created": { - "time": 1623110400000 - }, - 
"lastModified": { - "time": 1623110400000 - }, - "tags": [] - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" - }, - { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_6,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -1772,17 +2080,18 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_6,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "test_db.test_schema.table_7", + "schemaName": "test_db.test_schema.table_6", "platform": "urn:li:dataPlatform:snowflake", "version": 0, "created": { @@ -1935,20 +2244,54 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_6,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": 
"urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_6,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/", - "name": "TABLE_7", - "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_7", + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_6/", + "name": "TABLE_6", + "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_6", "description": "Comment for Table", "created": { "time": 1623110400000 @@ -1961,27 +2304,46 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_10,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_7,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_6,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -1993,36 +2355,75 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_6,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": 
"urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" }, { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_7,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_6,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_4,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2032,17 +2433,18 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_4,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "test_db.test_schema.table_8", + "schemaName": "test_db.test_schema.table_4", "platform": "urn:li:dataPlatform:snowflake", "version": 0, "created": { @@ -2195,20 +2597,21 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_7,PROD)", "changeType": "UPSERT", "aspectName": "datasetProperties", "aspect": { "json": { "customProperties": {}, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/", - "name": "TABLE_8", - "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_8", + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_7/", + "name": "TABLE_7", + "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_7", "description": "Comment for Table", "created": { "time": 1623110400000 @@ -2221,27 +2624,56 @@ }, "systemMetadata": { "lastObserved": 
1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_4,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_4,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_4/", + "name": "TABLE_4", + "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_4", + "description": "Comment for Table", + "created": { + "time": 1623110400000 + }, + "lastModified": { + "time": 1623110400000 + }, + "tags": [] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_7,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -2253,56 +2685,94 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_7,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { "json": { "path": [ { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" }, { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_4,PROD)", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "subTypes", "aspect": { "json": { - "removed": 
false + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_4,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" + }, + { + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_7,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "test_db.test_schema.table_9", + "schemaName": "test_db.test_schema.table_7", "platform": "urn:li:dataPlatform:snowflake", "version": 0, "created": { @@ -2455,96 +2925,15 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_8,PROD)", "changeType": "UPSERT", - "aspectName": "datasetProperties", - "aspect": { - "json": { - "customProperties": {}, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/", - "name": "TABLE_9", - "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_9", - "description": "Comment for Table", - "created": { - "time": 1623110400000 - }, - "lastModified": { - "time": 1623110400000 - }, - "tags": [] - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)", - "changeType": "UPSERT", - "aspectName": "container", - "aspect": { - "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)", - "changeType": "UPSERT", - "aspectName": "subTypes", - "aspect": { - "json": { - "typeNames": [ - "Table" - ] - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": 
"urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" - }, - { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)", - "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "status", "aspect": { "json": { "removed": false @@ -2552,17 +2941,18 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_8,PROD)", "changeType": "UPSERT", "aspectName": "schemaMetadata", "aspect": { "json": { - "schemaName": "test_db.test_schema.table_10", + "schemaName": "test_db.test_schema.table_8", "platform": "urn:li:dataPlatform:snowflake", "version": 0, "created": { @@ -2715,329 +3105,877 @@ }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_4,PROD)", "changeType": "UPSERT", - "aspectName": "datasetProperties", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "customProperties": {}, - "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_10/", - "name": "TABLE_10", - "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_10", - "description": "Comment for Table", - "created": { - "time": 1623110400000 - }, - "lastModified": { - "time": 1623110400000 - }, - "tags": [] + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_8,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { "json": { - "container": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_8,PROD)", "changeType": "UPSERT", - "aspectName": "subTypes", + "aspectName": "datasetProperties", "aspect": { "json": { - 
"typeNames": [ - "Table" - ] + "customProperties": {}, + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_8/", + "name": "TABLE_8", + "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_8", + "description": "Comment for Table", + "created": { + "time": 1623110400000 + }, + "lastModified": { + "time": 1623110400000 + }, + "tags": [] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_7,PROD)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "container", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:5e359958be02ce647cd9ac196dbd4585", - "urn": "urn:li:container:5e359958be02ce647cd9ac196dbd4585" - }, - { - "id": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c", - "urn": "urn:li:container:94c696a054bab40b73e640a7f82e3b1c" - } - ] + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_1,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_9,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" - } - ] + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_8,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "subTypes", "aspect": { "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" - } + "typeNames": [ + "Table" ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_3,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_8,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "browsePathsV2", "aspect": { "json": { - "upstreams": [ + "path": [ { - "auditStamp": { - "time": 0, - 
"actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_4,PROD)", - "changeType": "UPSERT", - "aspectName": "upstreamLineage", - "aspect": { - "json": { - "upstreams": [ + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" + }, + { + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_5,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_9,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "status", "aspect": { "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" - } - ] + "removed": false } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_6,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_8,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "dataPlatformInstance", "aspect": { "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" - } - ] + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_7,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_9,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "datasetProperties", "aspect": { "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" - } - ] + 
"customProperties": {}, + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/table/TABLE_9/", + "name": "TABLE_9", + "qualifiedName": "TEST_DB.TEST_SCHEMA.TABLE_9", + "description": "Comment for Table", + "created": { + "time": 1623110400000 + }, + "lastModified": { + "time": 1623110400000 + }, + "tags": [] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_8,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_9,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "subTypes", "aspect": { "json": { - "upstreams": [ - { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" - } + "typeNames": [ + "Table" ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_9,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_9,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "browsePathsV2", "aspect": { "json": { - "upstreams": [ + "path": [ { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" - }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" + }, + { + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_10,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_9,PROD)", "changeType": "UPSERT", - "aspectName": "upstreamLineage", + "aspectName": "schemaMetadata", "aspect": { "json": { - "upstreams": [ + "schemaName": "test_db.test_schema.table_9", + "platform": "urn:li:dataPlatform:snowflake", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ { - "auditStamp": { - "time": 0, - "actor": "urn:li:corpuser:unknown" + "fieldPath": "col_1", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.NumberType": {} + } }, - "dataset": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,test_db.test_schema.table_2,PROD)", - "type": "TRANSFORMED" + "nativeDataType": "NUMBER(38,0)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_2", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_3", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_4", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_5", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_6", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_7", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_8", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_9", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_10", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_9,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "create view view_2 as select * from table_2", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "dataPlatformInstance", + "aspect": { + "json": { + "platform": "urn:li:dataPlatform:snowflake", + "instance": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "status", + "aspect": { + "json": { + "removed": false + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "datasetProperties", + "aspect": { + "json": { + "customProperties": {}, + "externalUrl": "https://app.abc12345.ap-south-1.privatelink.snowflakecomputing.com/#/data/databases/TEST_DB/schemas/TEST_SCHEMA/view/VIEW_2/", + "name": "VIEW_2", + "qualifiedName": "TEST_DB.TEST_SCHEMA.VIEW_2", + "description": "Comment for View", + "created": { + "time": 1623110400000 + }, + "lastModified": { + "time": 1623110400000 + }, + "tags": [] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "View" + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)", + "urn": "urn:li:dataPlatformInstance:(urn:li:dataPlatform:snowflake,instance1)" + }, + { + "id": "urn:li:container:900b1327253068cb1537b1b3c807ddab", + "urn": "urn:li:container:900b1327253068cb1537b1b3c807ddab" + }, + { + "id": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f", + "urn": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "schemaMetadata", + "aspect": { + "json": { + "schemaName": "test_db.test_schema.view_2", + "platform": "urn:li:dataPlatform:snowflake", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "col_1", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + 
"com.linkedin.schema.NumberType": {} + } + }, + "nativeDataType": "NUMBER(38,0)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_2", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_3", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_4", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_5", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_6", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_7", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_8", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_9", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "col_10", + "nullable": false, + "description": "Comment for column", + "type": { + "type": { + "com.linkedin.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(255)", + "recursive": false, + "isPartOfKey": false + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eac598ee71ef1b5e24448d650c08aa5f" + } + }, + "systemMetadata": { + "lastObserved": 1654621200000, + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_1)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_1)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_10)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_10)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_2)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_2)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_3)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_3)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_4)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_4)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_5)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_5)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_6)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_6)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_7)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_7)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_8)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_8)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.table_2,PROD),col_9)" + ], 
+ "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:snowflake,instance1.test_db.test_schema.view_2,PROD),col_9)" + ], + "confidenceScore": 1.0 } ] } }, "systemMetadata": { "lastObserved": 1654621200000, - "runId": "snowflake-2022_06_07-17_00_00" + "runId": "snowflake-2022_06_07-17_00_00", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index dec50aefd19f04..4c00e48ede9fbd 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -30,6 +30,8 @@ from tests.integration.snowflake.common import FROZEN_TIME, default_query_results from tests.test_helpers import mce_helpers +pytestmark = pytest.mark.integration_batch_2 + def random_email(): return ( @@ -55,7 +57,6 @@ def random_cloud_region(): ) -@pytest.mark.integration def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" @@ -124,6 +125,7 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): validate_upstreams_against_patterns=False, include_operational_stats=True, email_as_user_identifier=True, + incremental_lineage=False, start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc ), @@ -183,7 +185,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): @freeze_time(FROZEN_TIME) -@pytest.mark.integration def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_graph): test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" @@ -210,10 +211,12 @@ def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_ include_technical_schema=True, include_table_lineage=True, include_column_lineage=False, - include_views=False, - include_view_lineage=False, + include_views=True, + include_view_lineage=True, include_usage_stats=False, + incremental_lineage=False, include_operational_stats=False, + platform_instance="instance1", start_time=datetime(2022, 6, 6, 0, 0, 0, 0).replace( tzinfo=timezone.utc ), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index bba53c1e97a478..4b0dd2b1045a37 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -283,10 +283,13 @@ def test_snowflake_unexpected_snowflake_view_lineage_error_causes_pipeline_warni ) snowflake_pipeline_config1 = snowflake_pipeline_config.copy() - cast( + config = cast( SnowflakeV2Config, cast(PipelineConfig, snowflake_pipeline_config1).source.config, - ).include_view_lineage = True + ) + config.include_table_lineage = True + config.include_view_lineage = True + pipeline = Pipeline(snowflake_pipeline_config1) pipeline.run() pipeline.raise_from_status() # pipeline should not fail diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py index f72bd5b72d2cd0..7e2ac94fa4e35c 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py +++ 
b/metadata-ingestion/tests/integration/snowflake/test_snowflake_stateful.py @@ -31,6 +31,7 @@ def stateful_pipeline_config(include_tables: bool) -> PipelineConfig: match_fully_qualified_names=True, schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_tables=include_tables, + incremental_lineage=False, stateful_ingestion=StatefulStaleMetadataRemovalConfig.parse_obj( { "enabled": True, @@ -49,7 +50,7 @@ def stateful_pipeline_config(include_tables: bool) -> PipelineConfig: @freeze_time(FROZEN_TIME) -def test_tableau_stateful(mock_datahub_graph): +def test_stale_metadata_removal(mock_datahub_graph): with mock.patch( "datahub.ingestion.source.state_provider.datahub_ingestion_checkpointing_provider.DataHubGraph", mock_datahub_graph, diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json index a495d04c4e398c..2fe7a76fd01ae6 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_to_file.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,24 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -80,7 +100,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -91,11 +112,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_id": "1f2f14ba-db84-4fa1-910e-7df71bede642", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2023-03-10 16:27:54.970000", - "date_modified": "2023-03-10 16:27:55.097000", + "date_created": "2023-10-27 10:11:55.540000", + "date_modified": "2023-10-27 10:11:55.667000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -110,7 +131,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -127,22 +149,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -163,7 +171,8 @@ }, 
"systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -178,7 +187,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -193,7 +203,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -210,7 +221,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -225,7 +237,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -245,7 +258,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -266,7 +280,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -281,7 +296,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -296,7 +312,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -313,7 +330,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -328,7 +346,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -348,7 +367,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -369,7 +389,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -384,7 +405,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -399,7 +421,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -416,7 +439,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -431,7 +455,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -451,7 +476,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -472,7 +498,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -487,7 +514,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -502,7 +530,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -519,7 +548,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": 
"mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -534,7 +564,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -554,7 +585,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -575,7 +607,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -590,7 +623,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -605,7 +639,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -622,7 +657,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -637,7 +673,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -657,7 +694,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +716,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -693,7 +732,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -708,7 +748,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -725,7 +766,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -740,7 +782,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -760,7 +803,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -781,7 +825,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -796,7 +841,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -811,7 +857,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -828,7 +875,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -843,7 +891,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -863,7 +912,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -884,7 +934,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -899,7 +950,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -914,7 +966,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -931,7 +984,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -946,7 +1000,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -966,7 +1021,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -987,7 +1043,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1002,7 +1059,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1017,7 +1075,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1034,7 +1093,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1049,7 +1109,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1069,7 +1130,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1090,7 +1152,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1105,7 +1168,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1120,7 +1184,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1137,7 +1202,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1152,7 +1218,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1172,7 +1239,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1187,7 +1255,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1259,7 +1328,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1276,7 +1346,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1300,7 +1371,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ 
-1321,7 +1393,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1336,7 +1409,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1351,7 +1425,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1368,7 +1443,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1383,7 +1459,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1403,7 +1480,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1418,7 +1496,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1491,7 +1570,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1508,7 +1588,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1532,7 +1613,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1547,7 +1629,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1644,7 +1727,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1661,7 +1745,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1685,7 +1770,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1700,7 +1786,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1796,7 +1883,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1813,7 +1901,33 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", + "urn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" + }, + { + "id": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671", + "urn": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1830,12 +1944,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - 
"runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -1843,14 +1958,14 @@ "customProperties": { "procedure_depends_on": "{}", "depending_on_procedure": "{}", - "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2023-03-10 16:27:54.907000", - "date_modified": "2023-03-10 16:27:54.907000" + "date_created": "2023-10-27 10:11:55.460000", + "date_modified": "2023-10-27 10:11:55.460000" }, "externalUrl": "", - "name": "demodata.Foo.DBs", + "name": "demodata.Foo.Proc.With.SpecialChar", "type": { "string": "MSSQL_STORED_PROCEDURE" } @@ -1858,12 +1973,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -1875,31 +1991,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", - "urn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" - }, - { - "id": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671", - "urn": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1920,7 +2013,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1935,7 +2029,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1950,7 +2045,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1967,7 +2063,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1982,7 +2079,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2002,7 +2100,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2023,7 +2122,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2038,7 +2138,8 @@ 
}, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2053,7 +2154,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2070,7 +2172,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2085,7 +2188,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2105,7 +2209,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2126,7 +2231,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2141,7 +2247,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2156,7 +2263,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2173,7 +2281,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2188,7 +2297,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2208,7 +2318,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2228,7 +2339,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2243,7 +2355,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2258,7 +2371,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2275,7 +2389,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2290,7 +2405,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2311,7 +2427,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2326,7 +2443,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2341,7 +2459,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2358,7 +2477,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2373,7 +2493,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2393,7 +2514,8 @@ }, "systemMetadata": { 
"lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2414,7 +2536,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2429,7 +2552,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2444,7 +2568,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2461,7 +2586,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2476,7 +2602,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2496,7 +2623,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2517,7 +2645,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2532,7 +2661,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2547,7 +2677,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2564,7 +2695,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2579,7 +2711,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2599,7 +2732,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2620,7 +2754,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2635,7 +2770,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2650,7 +2786,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2667,7 +2804,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2682,7 +2820,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2702,7 +2841,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2723,7 +2863,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2738,7 +2879,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2753,7 +2895,8 @@ }, "systemMetadata": { "lastObserved": 
1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2770,7 +2913,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2785,7 +2929,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2805,7 +2950,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2826,7 +2972,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2841,7 +2988,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2856,7 +3004,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2873,7 +3022,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2888,7 +3038,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2908,7 +3059,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2929,7 +3081,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2944,7 +3097,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2959,7 +3113,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2976,7 +3131,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2991,7 +3147,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3011,7 +3168,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3032,7 +3190,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3047,7 +3206,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3062,7 +3222,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3079,7 +3240,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3094,7 +3256,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3114,7 +3277,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": 
"mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3135,7 +3299,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3150,7 +3315,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3165,7 +3331,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3182,7 +3349,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3197,7 +3365,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3217,7 +3386,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3238,7 +3408,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3253,7 +3424,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3268,7 +3440,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3285,7 +3458,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3300,7 +3474,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3320,7 +3495,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3335,7 +3511,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3407,7 +3584,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3424,7 +3602,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3448,7 +3627,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3469,7 +3649,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3484,7 +3665,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3499,7 +3681,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3516,7 +3699,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3531,7 +3715,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": 
"mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3551,7 +3736,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3566,7 +3752,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3638,7 +3825,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3655,7 +3843,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3679,7 +3868,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3694,7 +3884,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3790,7 +3981,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3807,7 +3999,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3831,7 +4024,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3852,7 +4046,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3867,7 +4062,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3882,7 +4078,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3899,7 +4096,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3914,7 +4112,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3934,7 +4133,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3955,7 +4155,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3970,7 +4171,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -3985,7 +4187,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -4002,7 +4205,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -4017,7 +4221,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -4037,7 +4242,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + 
"lastRunId": "no-run-id-provided" } }, { @@ -4058,7 +4264,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -4073,7 +4280,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -4088,7 +4296,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -4105,7 +4314,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -4120,27 +4330,34 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:c6627af82d44de89492e1a9315ae9f4b", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "browsePathsV2", "aspect": { "json": { - "removed": false + "path": [ + { + "id": "urn:li:container:9447d283fb4f95ce7474f1db0179bb59", + "urn": "urn:li:container:9447d283fb4f95ce7474f1db0179bb59" + } + ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -4150,12 +4367,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -4165,12 +4383,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -4180,27 +4399,24 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:c6627af82d44de89492e1a9315ae9f4b", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:9447d283fb4f95ce7474f1db0179bb59", - "urn": "urn:li:container:9447d283fb4f95ce7474f1db0179bb59" - } - ] + "removed": false } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": 
"mssql-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json index 8277ff8bf7e89a..c1984828750eb5 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_no_db_with_filter.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,24 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -80,7 +100,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -91,11 +112,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_id": "1f2f14ba-db84-4fa1-910e-7df71bede642", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2023-03-10 16:27:54.970000", - "date_modified": "2023-03-10 16:27:55.097000", + "date_created": "2023-10-27 10:11:55.540000", + "date_modified": "2023-10-27 10:11:55.667000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -110,7 +131,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -127,22 +149,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -163,7 +171,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -178,7 +187,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -193,7 +203,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -210,7 +221,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -225,7 +237,8 @@ }, 
"systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -245,7 +258,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -266,7 +280,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -281,7 +296,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -296,7 +312,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -313,7 +330,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -328,7 +346,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -348,7 +367,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -369,7 +389,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -384,7 +405,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -399,7 +421,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -416,7 +439,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -431,7 +455,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -451,7 +476,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -472,7 +498,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -487,7 +514,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -502,7 +530,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -519,7 +548,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -534,7 +564,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -554,7 +585,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -575,7 +607,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -590,7 +623,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": 
"mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -605,7 +639,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -622,7 +657,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -637,7 +673,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -657,7 +694,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +716,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -693,7 +732,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -708,7 +748,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -725,7 +766,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -740,7 +782,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -760,7 +803,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -781,7 +825,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -796,7 +841,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -811,7 +857,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -828,7 +875,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -843,7 +891,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -863,7 +912,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -884,7 +934,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -899,7 +950,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -914,7 +966,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -931,7 +984,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -946,7 +1000,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -966,7 +1021,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -987,7 +1043,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1002,7 +1059,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1017,7 +1075,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1034,7 +1093,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1049,7 +1109,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1069,7 +1130,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1090,7 +1152,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1105,7 +1168,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1120,7 +1184,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1137,7 +1202,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1152,7 +1218,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1172,7 +1239,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1187,7 +1255,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1259,7 +1328,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1276,7 +1346,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1300,7 +1371,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1321,7 +1393,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1336,7 +1409,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1351,7 +1425,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1368,7 +1443,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { 
@@ -1383,7 +1459,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1403,7 +1480,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1418,7 +1496,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1491,7 +1570,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1508,7 +1588,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1532,7 +1613,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1547,7 +1629,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1644,7 +1727,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1661,7 +1745,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1685,7 +1770,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1700,7 +1786,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1796,7 +1883,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1813,7 +1901,33 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", + "urn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" + }, + { + "id": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671", + "urn": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1830,12 +1944,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -1843,14 +1958,14 @@ "customProperties": { "procedure_depends_on": "{}", "depending_on_procedure": "{}", - "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n 
SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2023-03-10 16:27:54.907000", - "date_modified": "2023-03-10 16:27:54.907000" + "date_created": "2023-10-27 10:11:55.460000", + "date_modified": "2023-10-27 10:11:55.460000" }, "externalUrl": "", - "name": "demodata.Foo.DBs", + "name": "demodata.Foo.Proc.With.SpecialChar", "type": { "string": "MSSQL_STORED_PROCEDURE" } @@ -1858,12 +1973,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -1875,31 +1991,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoData.Foo.SalesReason,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", - "urn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" - }, - { - "id": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671", - "urn": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1920,7 +2013,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1935,7 +2029,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1950,7 +2045,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1967,7 +2063,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1982,7 +2079,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2002,7 +2100,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2023,7 +2122,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2038,7 +2138,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2053,7 +2154,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2070,7 +2172,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2085,7 +2188,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2105,7 +2209,8 @@ 
}, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2126,7 +2231,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2141,7 +2247,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2156,7 +2263,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2173,7 +2281,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2188,27 +2297,34 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "browsePathsV2", "aspect": { "json": { - "removed": false + "path": [ + { + "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", + "urn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" + } + ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2218,12 +2334,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2233,12 +2350,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2248,27 +2366,24 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", - "urn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" - } - ] + "removed": 
false } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json index f3714bba6364d0..804a8d74d0d512 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_to_file.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,24 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "container", + "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -80,7 +100,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -91,11 +112,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "1df94c0f-15fd-4b68-8ca3-6053a0332362", + "job_id": "1f2f14ba-db84-4fa1-910e-7df71bede642", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2023-03-10 16:27:54.970000", - "date_modified": "2023-03-10 16:27:55.097000", + "date_created": "2023-10-27 10:11:55.540000", + "date_modified": "2023-10-27 10:11:55.667000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -110,7 +131,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -127,22 +149,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" - } -}, -{ - "entityType": "container", - "entityUrn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -163,7 +171,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -178,7 +187,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -193,7 +203,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -210,7 +221,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + 
"lastRunId": "no-run-id-provided" } }, { @@ -225,7 +237,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -245,7 +258,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -266,7 +280,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -281,7 +296,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -296,7 +312,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -313,7 +330,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -328,7 +346,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -348,7 +367,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -369,7 +389,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -384,7 +405,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -399,7 +421,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -416,7 +439,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -431,7 +455,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -451,7 +476,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -472,7 +498,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -487,7 +514,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -502,7 +530,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -519,7 +548,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -534,7 +564,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -554,7 +585,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -575,7 +607,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -590,7 +623,8 @@ }, 
"systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -605,7 +639,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -622,7 +657,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -637,7 +673,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -657,7 +694,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +716,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -693,7 +732,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -708,7 +748,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -725,7 +766,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -740,7 +782,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -760,7 +803,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -781,7 +825,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -796,7 +841,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -811,7 +857,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -828,7 +875,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -843,7 +891,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -863,7 +912,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -884,7 +934,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -899,7 +950,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -914,7 +966,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -931,7 +984,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -946,7 +1000,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": 
"mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -966,7 +1021,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -987,7 +1043,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1002,7 +1059,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1017,7 +1075,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1034,7 +1093,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1049,7 +1109,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1069,7 +1130,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1090,7 +1152,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1105,7 +1168,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1120,7 +1184,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1137,7 +1202,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1152,7 +1218,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1172,7 +1239,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1187,7 +1255,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1259,7 +1328,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1276,7 +1346,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1300,7 +1371,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1321,7 +1393,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1336,7 +1409,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1351,7 +1425,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1368,7 +1443,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": 
"mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1383,7 +1459,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1403,7 +1480,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1418,7 +1496,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1491,7 +1570,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1508,7 +1588,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1532,7 +1613,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1547,7 +1629,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1644,7 +1727,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1661,7 +1745,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1685,7 +1770,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1700,7 +1786,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1796,7 +1883,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1813,7 +1901,33 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoDataAlias.Foo.SalesReason,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", + "urn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" + }, + { + "id": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671", + "urn": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1615443388097, + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1830,12 +1944,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -1843,14 +1958,14 @@ "customProperties": { "procedure_depends_on": "{}", "depending_on_procedure": "{}", - "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "code": 
"CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2023-03-10 16:27:54.907000", - "date_modified": "2023-03-10 16:27:54.907000" + "date_created": "2023-10-27 10:11:55.460000", + "date_modified": "2023-10-27 10:11:55.460000" }, "externalUrl": "", - "name": "demodata.Foo.DBs", + "name": "demodata.Foo.Proc.With.SpecialChar", "type": { "string": "MSSQL_STORED_PROCEDURE" } @@ -1858,12 +1973,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -1875,31 +1991,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" - } -}, -{ - "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:mssql,DemoDataAlias.Foo.SalesReason,PROD)", - "changeType": "UPSERT", - "aspectName": "browsePathsV2", - "aspect": { - "json": { - "path": [ - { - "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", - "urn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" - }, - { - "id": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671", - "urn": "urn:li:container:6e5c6d608d0a2dcc4eb03591382e5671" - } - ] - } - }, - "systemMetadata": { - "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1920,7 +2013,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1935,7 +2029,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1950,7 +2045,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1967,7 +2063,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1982,7 +2079,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2002,7 +2100,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2023,7 +2122,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2038,7 +2138,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2053,7 +2154,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2070,7 +2172,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2085,7 +2188,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", 
+ "lastRunId": "no-run-id-provided" } }, { @@ -2105,7 +2209,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2126,7 +2231,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2141,7 +2247,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2156,7 +2263,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2173,7 +2281,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2188,27 +2297,34 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", + "entityType": "container", + "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", "changeType": "UPSERT", - "aspectName": "status", + "aspectName": "browsePathsV2", "aspect": { "json": { - "removed": false + "path": [ + { + "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", + "urn": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" + } + ] } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataFlow", - "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2218,12 +2334,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", + "entityType": "dataFlow", + "entityUrn": "urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2233,12 +2350,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.Weekly Demo Data Backup,PROD),localhost.Weekly Demo Data Backup)", "changeType": "UPSERT", "aspectName": "status", "aspect": { @@ -2248,27 +2366,24 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { - "entityType": "container", - "entityUrn": "urn:li:container:3f157d8292fb473142f19e2250af537f", + "entityType": "dataJob", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", - "aspectName": "browsePathsV2", + "aspectName": "status", "aspect": { "json": { - "path": [ - { - "id": "urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5", - "urn": 
"urn:li:container:b7062d1c0c650d9de0f7a9a5de00b1b5" - } - ] + "removed": false } }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json index d25d23daae2eac..9d1b288057a160 100644 --- a/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json +++ b/metadata-ingestion/tests/integration/sql_server/golden_files/golden_mces_mssql_with_lower_case_urn.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -95,7 +100,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -106,11 +112,11 @@ "aspect": { "json": { "customProperties": { - "job_id": "b6a0c1e2-f90a-4c86-a226-bf7ca59ad79f", + "job_id": "1f2f14ba-db84-4fa1-910e-7df71bede642", "job_name": "Weekly Demo Data Backup", "description": "No description available.", - "date_created": "2023-08-06 21:01:05.157000", - "date_modified": "2023-08-06 21:01:05.283000", + "date_created": "2023-10-27 10:11:55.540000", + "date_modified": "2023-10-27 10:11:55.667000", "step_id": "1", "step_name": "Set database to read only", "subsystem": "TSQL", @@ -125,7 +131,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -142,7 +149,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -163,7 +171,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -178,7 +187,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -193,7 +203,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -210,7 +221,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -225,7 +237,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -245,7 +258,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": 
"no-run-id-provided" } }, { @@ -266,7 +280,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -281,7 +296,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -296,7 +312,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -313,7 +330,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -328,7 +346,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -348,7 +367,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -369,7 +389,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -384,7 +405,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -399,7 +421,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -416,7 +439,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -431,7 +455,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -451,7 +476,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -472,7 +498,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -487,7 +514,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -502,7 +530,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -519,7 +548,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -534,7 +564,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -554,7 +585,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -575,7 +607,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -590,7 +623,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -605,7 +639,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -622,7 +657,8 @@ }, "systemMetadata": 
{ "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -637,7 +673,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -657,7 +694,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -678,7 +716,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -693,7 +732,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -708,7 +748,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -725,7 +766,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -740,7 +782,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -760,7 +803,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -781,7 +825,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -796,7 +841,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -811,7 +857,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -828,7 +875,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -843,7 +891,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -863,7 +912,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -884,7 +934,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -899,7 +950,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -914,7 +966,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -931,7 +984,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -946,7 +1000,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -966,7 +1021,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -987,7 +1043,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + 
"runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1002,7 +1059,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1017,7 +1075,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1034,7 +1093,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1049,7 +1109,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1069,7 +1130,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1090,7 +1152,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1105,7 +1168,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1120,7 +1184,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1137,7 +1202,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1152,7 +1218,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1172,7 +1239,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1187,7 +1255,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1259,7 +1328,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1276,7 +1346,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1300,7 +1371,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1321,7 +1393,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1336,7 +1409,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1351,7 +1425,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1368,7 +1443,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1383,7 +1459,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1403,7 +1480,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + 
"lastRunId": "no-run-id-provided" } }, { @@ -1418,7 +1496,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1491,7 +1570,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1508,7 +1588,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1532,7 +1613,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1547,7 +1629,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1644,7 +1727,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1661,7 +1745,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1685,7 +1770,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1700,7 +1786,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1796,7 +1883,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1813,7 +1901,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1837,7 +1926,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1854,12 +1944,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInfo", "aspect": { @@ -1867,14 +1958,14 @@ "customProperties": { "procedure_depends_on": "{}", "depending_on_procedure": "{}", - "code": "CREATE PROCEDURE Foo.DBs @ID INT\nAS\n SELECT @ID AS ThatDB;\n", + "code": "CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT\nAS\n SELECT @ID AS ThatDB;\n", "input parameters": "['@ID']", "parameter @ID": "{'type': 'int'}", - "date_created": "2023-08-06 21:01:05.093000", - "date_modified": "2023-08-06 21:01:05.093000" + "date_created": "2023-10-27 10:11:55.460000", + "date_modified": "2023-10-27 10:11:55.460000" }, "externalUrl": "", - "name": "demodata.Foo.DBs", + "name": "demodata.Foo.Proc.With.SpecialChar", "type": { "string": "MSSQL_STORED_PROCEDURE" } @@ -1882,12 +1973,13 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataJob", - "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)", + 
"entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)", "changeType": "UPSERT", "aspectName": "dataJobInputOutput", "aspect": { @@ -1899,7 +1991,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1920,7 +2013,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1935,7 +2029,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1950,7 +2045,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1967,7 +2063,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1982,7 +2079,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2002,7 +2100,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2023,7 +2122,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2038,7 +2138,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2053,7 +2154,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2070,7 +2172,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2085,7 +2188,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2105,7 +2209,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2126,7 +2231,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2141,7 +2247,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2156,7 +2263,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2173,7 +2281,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2188,7 +2297,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2208,7 +2318,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, { @@ -2223,7 +2334,8 @@ }, "systemMetadata": { "lastObserved": 1615443388097, - "runId": "mssql-test" + "runId": "mssql-test", + "lastRunId": "no-run-id-provided" } }, 
 {
@@ -2238,7 +2350,8 @@
     },
     "systemMetadata": {
         "lastObserved": 1615443388097,
-        "runId": "mssql-test"
+        "runId": "mssql-test",
+        "lastRunId": "no-run-id-provided"
     }
 },
 {
@@ -2253,12 +2366,13 @@
     },
     "systemMetadata": {
         "lastObserved": 1615443388097,
-        "runId": "mssql-test"
+        "runId": "mssql-test",
+        "lastRunId": "no-run-id-provided"
     }
 },
 {
     "entityType": "dataJob",
-    "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),DBs)",
+    "entityUrn": "urn:li:dataJob:(urn:li:dataFlow:(mssql,localhost.demodata.Foo.stored_procedures,PROD),Proc.With.SpecialChar)",
     "changeType": "UPSERT",
     "aspectName": "status",
     "aspect": {
@@ -2268,7 +2382,8 @@
     },
     "systemMetadata": {
         "lastObserved": 1615443388097,
-        "runId": "mssql-test"
+        "runId": "mssql-test",
+        "lastRunId": "no-run-id-provided"
     }
 }
 ]
\ No newline at end of file
diff --git a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql
index c1347a7c8cacaf..a17d52f9a39b1b 100644
--- a/metadata-ingestion/tests/integration/sql_server/setup/setup.sql
+++ b/metadata-ingestion/tests/integration/sql_server/setup/setup.sql
@@ -45,7 +45,7 @@ CREATE TABLE Foo.SalesReason
 )
 ;
 GO
-CREATE PROCEDURE Foo.DBs @ID INT
+CREATE PROCEDURE [Foo].[Proc.With.SpecialChar] @ID INT
 AS
     SELECT @ID AS ThatDB;
 GO
diff --git a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
index 099690fed34c27..f439a322c26771 100644
--- a/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
+++ b/metadata-ingestion/tests/integration/sql_server/test_sql_server.py
@@ -6,7 +6,7 @@
 from tests.test_helpers import mce_helpers
 from tests.test_helpers.click_helpers import run_datahub_cmd
-from tests.test_helpers.docker_helpers import wait_for_port
+from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port
 
 
 @pytest.fixture(scope="module")
@@ -29,6 +29,9 @@ def mssql_runner(docker_compose_runner, pytestconfig):
         assert ret.returncode == 0
     yield docker_services
 
+    # The image is pretty large, so we remove it after the test.
+    cleanup_image("mcr.microsoft.com/mssql/server")
+
 
 SOURCE_FILES_PATH = "./tests/integration/sql_server/source_files"
 config_file = os.listdir(SOURCE_FILES_PATH)
diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
index 53b8519a886d3b..0510f4a40f6597 100644
--- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
+++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py
@@ -20,7 +20,7 @@
 from datahub.ingestion.source.tableau import TableauConfig, TableauSource
 from datahub.ingestion.source.tableau_common import (
     TableauLineageOverrides,
-    make_table_urn,
+    TableauUpstreamReference,
 )
 from datahub.metadata.com.linkedin.pegasus2avro.dataset import (
     DatasetLineageType,
@@ -546,13 +546,13 @@ def test_lineage_overrides():
     enable_logging()
     # Simple - specify platform instance to presto table
     assert (
-        make_table_urn(
-            DEFAULT_ENV,
+        TableauUpstreamReference(
             "presto_catalog",
-            "presto",
             "test-schema",
-            "presto_catalog.test-schema.test-table",
-            platform_instance_map={"presto": "my_presto_instance"},
+            "test-table",
+            "presto",
+        ).make_dataset_urn(
+            env=DEFAULT_ENV, platform_instance_map={"presto": "my_presto_instance"}
         )
         == "urn:li:dataset:(urn:li:dataPlatform:presto,my_presto_instance.presto_catalog.test-schema.test-table,PROD)"
     )
@@ -560,12 +560,13 @@
     # Transform presto urn to hive urn
     # resulting platform instance for hive = mapped platform instance + presto_catalog
     assert (
-        make_table_urn(
-            DEFAULT_ENV,
+        TableauUpstreamReference(
             "presto_catalog",
-            "presto",
             "test-schema",
-            "presto_catalog.test-schema.test-table",
+            "test-table",
+            "presto",
+        ).make_dataset_urn(
+            env=DEFAULT_ENV,
             platform_instance_map={"presto": "my_instance"},
             lineage_overrides=TableauLineageOverrides(
                 platform_override_map={"presto": "hive"},
@@ -574,14 +575,15 @@
         == "urn:li:dataset:(urn:li:dataPlatform:hive,my_instance.presto_catalog.test-schema.test-table,PROD)"
     )
 
-    # tranform hive urn to presto urn
+    # transform hive urn to presto urn
     assert (
-        make_table_urn(
-            DEFAULT_ENV,
-            "",
-            "hive",
+        TableauUpstreamReference(
+            None,
             "test-schema",
-            "test-schema.test-table",
+            "test-table",
+            "hive",
+        ).make_dataset_urn(
+            env=DEFAULT_ENV,
             platform_instance_map={"hive": "my_presto_instance.presto_catalog"},
             lineage_overrides=TableauLineageOverrides(
                 platform_override_map={"hive": "presto"},
@@ -757,7 +759,7 @@ def test_tableau_no_verify():
 
 
 @freeze_time(FROZEN_TIME)
-@pytest.mark.slow
+@pytest.mark.integration_batch_2
 def test_tableau_signout_timeout(pytestconfig, tmp_path, mock_datahub_graph):
     enable_logging()
     output_file_name: str = "tableau_signout_timeout_mces.json"
diff --git a/metadata-ingestion/tests/integration/trino/test_trino.py b/metadata-ingestion/tests/integration/trino/test_trino.py
index 22e5f6f91a06ef..177c273c0d2424 100644
--- a/metadata-ingestion/tests/integration/trino/test_trino.py
+++ b/metadata-ingestion/tests/integration/trino/test_trino.py
@@ -13,6 +13,8 @@
 from tests.test_helpers import fs_helpers, mce_helpers
 from tests.test_helpers.docker_helpers import wait_for_port
 
+pytestmark = pytest.mark.integration_batch_1
+
 FROZEN_TIME = "2021-09-23 12:00:00"
 
 data_platform = "trino"
@@ -51,7 +53,6 @@ def loaded_trino(trino_runner):
 
 
 @freeze_time(FROZEN_TIME)
-@pytest.mark.integration
 @pytest.mark.xfail
 def test_trino_ingest(
     loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time
@@ -111,7
+112,6 @@ def test_trino_ingest( @freeze_time(FROZEN_TIME) -@pytest.mark.integration def test_trino_hive_ingest( loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time ): @@ -167,7 +167,6 @@ def test_trino_hive_ingest( @freeze_time(FROZEN_TIME) -@pytest.mark.integration def test_trino_instance_ingest( loaded_trino, test_resources_dir, pytestconfig, tmp_path, mock_time ): diff --git a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json index 19961e48b4a336..c43223c68a6b64 100644 --- a/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json +++ b/metadata-ingestion/tests/integration/trino/trino_hive_mces_golden.json @@ -16,7 +16,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -31,7 +32,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -46,7 +48,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -63,7 +66,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -78,7 +82,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -99,7 +104,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -114,7 +120,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -129,7 +136,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -146,7 +154,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -161,7 +170,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -181,7 +191,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -196,7 +207,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -219,7 +231,7 @@ "numrows": "1", "rawdatasize": "32", "totalsize": "33", - "transient_lastddltime": "1688422059" + "transient_lastddltime": "1698223433" }, "name": "array_struct_test", "description": "This table has array of structs", @@ -315,7 +327,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -332,7 +345,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -356,7 +370,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": 
"trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -371,7 +386,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -392,7 +408,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688422063" + "transient_lastddltime": "1698223435" }, "name": "map_test", "tags": [] @@ -454,7 +470,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -471,7 +488,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -495,7 +513,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -510,7 +529,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -531,7 +551,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688422062" + "transient_lastddltime": "1698223435" }, "name": "nested_struct_test", "tags": [] @@ -642,7 +662,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -659,7 +680,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -683,7 +705,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -698,7 +721,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -714,7 +738,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1688421792" + "transient_lastddltime": "1698223429" }, "name": "pokes", "tags": [] @@ -784,7 +808,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -801,7 +826,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -825,7 +851,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -840,7 +867,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -861,7 +889,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688421808" + "transient_lastddltime": "1698223431" }, "name": "struct_test", "tags": [] @@ -950,7 +978,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -967,7 +996,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -991,7 +1021,8 @@ }, "systemMetadata": { "lastObserved": 
1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1006,7 +1037,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1024,7 +1056,7 @@ "customProperties": { "numfiles": "0", "totalsize": "0", - "transient_lastddltime": "1688422062" + "transient_lastddltime": "1698223435" }, "name": "struct_test_view_materialized", "tags": [] @@ -1113,7 +1145,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1130,7 +1163,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1154,7 +1188,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1169,7 +1204,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1190,7 +1226,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688421807" + "transient_lastddltime": "1698223431" }, "name": "_test_table_underscore", "tags": [] @@ -1248,7 +1284,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1265,7 +1302,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1289,7 +1327,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1304,7 +1343,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1325,7 +1365,7 @@ "numrows": "0", "rawdatasize": "0", "totalsize": "0", - "transient_lastddltime": "1688422062" + "transient_lastddltime": "1698223435" }, "name": "union_test", "tags": [] @@ -1467,7 +1507,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1484,7 +1525,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1508,7 +1550,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1523,7 +1566,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1539,7 +1583,7 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "transient_lastddltime": "1688422062", + "transient_lastddltime": "1698223435", "view_definition": "SELECT \"property_id\", \"service\"\nFROM \"db1\".\"array_struct_test\"", "is_view": "True" }, @@ -1634,7 +1678,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1651,7 +1696,8 @@ }, 
"systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1668,7 +1714,57 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD)", + "type": "VIEW" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD),property_id)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD),property_id)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test,PROD),service)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:trino,hivedb.db1.array_struct_test_view,PROD),service)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1632398400000, + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } }, { @@ -1692,7 +1788,8 @@ }, "systemMetadata": { "lastObserved": 1632398400000, - "runId": "trino-hive-test" + "runId": "trino-hive-test", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/vertica/ddl.sql b/metadata-ingestion/tests/integration/vertica/ddl.sql index 59a71a1a1f7b59..ceebcd8e9ce2ad 100644 --- a/metadata-ingestion/tests/integration/vertica/ddl.sql +++ b/metadata-ingestion/tests/integration/vertica/ddl.sql @@ -1,5 +1,4 @@ -\set AUTOCOMMIT on -ALTER USER dbadmin IDENTIFIED BY 'abc123'; + -- Create a Top-k projection CREATE TABLE readings (meter_id INT, reading_date TIMESTAMP, reading_value FLOAT); @@ -35,12 +34,16 @@ SELECT tokenize(phrase) OVER () FROM phrases; -- Create a temp table -CREATE TEMPORARY TABLE sampletemp (a int, b int) ON COMMIT PRESERVE ROWS; -INSERT INTO sampletemp VALUES(1,2); +-- CREATE TEMPORARY TABLE sampletemp (a int, b int) ON COMMIT PRESERVE ROWS; +-- INSERT INTO sampletemp VALUES(1,2); -- Create partition key -ALTER TABLE store.store_orders_fact PARTITION BY date_ordered::DATE GROUP BY DATE_TRUNC('month', (date_ordered)::DATE); -SELECT PARTITION_TABLE('store.store_orders_fact'); -CREATE PROJECTION ytd_orders AS SELECT * FROM store.store_orders_fact ORDER BY date_ordered - ON PARTITION RANGE BETWEEN date_trunc('year',now())::date AND NULL; +-- ALTER TABLE store.store_orders_fact PARTITION BY date_ordered::DATE GROUP BY DATE_TRUNC('month', (date_ordered)::DATE); +-- SELECT PARTITION_TABLE('store.store_orders_fact'); +-- CREATE PROJECTION ytd_orders AS SELECT * FROM store.store_orders_fact ORDER BY date_ordered +-- ON PARTITION RANGE BETWEEN date_trunc('year',now())::date AND NULL; + + + + SELECT start_refresh(); \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/vertica/docker-compose.yml 
b/metadata-ingestion/tests/integration/vertica/docker-compose.yml
index 84af5c32a60e30..1ba7990c826b2e 100644
--- a/metadata-ingestion/tests/integration/vertica/docker-compose.yml
+++ b/metadata-ingestion/tests/integration/vertica/docker-compose.yml
@@ -6,7 +6,7 @@ services:
       APP_DB_USER: "dbadmin"
       APP_DB_PASSWORD: "abc123"
     container_name: vertica-ce
-    image: vertica/vertica-ce:12.0.2-0
+    image: vertica/vertica-ce:23.4.0-0
     ports:
       - "5433:5433"
       - "5444:5444"
diff --git a/metadata-ingestion/tests/integration/vertica/test_vertica.py b/metadata-ingestion/tests/integration/vertica/test_vertica.py
index fe306d1d0b2b8b..d7b4c390f75d94 100644
--- a/metadata-ingestion/tests/integration/vertica/test_vertica.py
+++ b/metadata-ingestion/tests/integration/vertica/test_vertica.py
@@ -1,13 +1,12 @@
 import subprocess
-import time
-from typing import List, Optional
+from typing import List
 
 import pytest
 from freezegun import freeze_time
 
 from tests.test_helpers import mce_helpers
 from tests.test_helpers.click_helpers import run_datahub_cmd
-from tests.test_helpers.docker_helpers import wait_for_port
+from tests.test_helpers.docker_helpers import cleanup_image, wait_for_port
 
 FROZEN_TIME = "2020-04-14 07:00:00"
@@ -17,13 +16,12 @@ def test_resources_dir(pytestconfig):
     return pytestconfig.rootpath / "tests/integration/vertica"
 
 
-def is_vertica_responsive(
-    container_name: str, port: int, hostname: Optional[str]
-) -> bool:
-    if hostname:
-        cmd = f"docker logs {container_name} 2>&1 | grep 'Vertica is now running' "
-        ret = subprocess.run(cmd, shell=True, stdout=subprocess.DEVNULL)
-
+def is_vertica_responsive(container_name: str) -> bool:
+    cmd = f"docker logs {container_name} 2>&1 | grep 'Vertica is now running' "
+    ret = subprocess.run(
+        cmd,
+        shell=True,
+    )
     return ret.returncode == 0
 
 
@@ -37,28 +35,25 @@ def vertica_runner(docker_compose_runner, test_resources_dir):
         "vertica-ce",
         5433,
         timeout=120,
-        checker=lambda: is_vertica_responsive(
-            "vertica-ce", 5433, hostname="vertica-ce"
-        ),
+        checker=lambda: is_vertica_responsive("vertica-ce"),
     )
     commands = """
     docker cp tests/integration/vertica/ddl.sql vertica-ce:/home/dbadmin/ &&
-    docker exec vertica-ce sh -c "/opt/vertica/bin/vsql -w abc123 -f /home/dbadmin/ddl.sql
+    docker exec vertica-ce sh -c "/opt/vertica/bin/vsql -w abc123 -f /home/dbadmin/ddl.sql"
     """
     ret = subprocess.run(commands, shell=True, stdout=subprocess.DEVNULL)
-    # waiting for vertica to create default table and system table and ml models
-    time.sleep(60)
-    assert ret.returncode >= 1
+    assert ret.returncode == 0
 
     yield docker_services
 
+    # The image is pretty large, so we remove it after the test.
+    cleanup_image("vertica/vertica-ce")
+
 
-# Test needs more work to be done , currently it is working fine.
@freeze_time(FROZEN_TIME) -@pytest.mark.skip("Failing in CI, cmd failing with exit code 1") @pytest.mark.integration def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path): test_resources_dir = pytestconfig.rootpath / "tests/integration/vertica" @@ -72,7 +67,7 @@ def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path): ignore_paths: List[str] = [ r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['create_time'\]", r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['table_size'\]", - r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['projection_size'\]", + r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['Projection_size'\]", r"root\[\d+\]\['proposedSnapshot'\].+\['aspects'\].+\['customProperties'\]\['ROS_Count'\]", r"root\[\d+\]\['aspect'\].+\['customProperties'\]\['cluster_size'\]", r"root\[\d+\]\['aspect'\].+\['customProperties'\]\['udx_language'\]", diff --git a/metadata-ingestion/tests/integration/vertica/vertica_mces_with_db_golden.json b/metadata-ingestion/tests/integration/vertica/vertica_mces_with_db_golden.json index 44a5e07d7b9965..ef535158165da5 100644 --- a/metadata-ingestion/tests/integration/vertica/vertica_mces_with_db_golden.json +++ b/metadata-ingestion/tests/integration/vertica/vertica_mces_with_db_golden.json @@ -11,7 +11,7 @@ "env": "PROD", "database": "vmart", "cluster_type": "Enterprise", - "cluster_size": "122 GB", + "cluster_size": "101 GB", "subcluster": " ", "communal_storage_path": "" }, @@ -20,7 +20,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -35,7 +36,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -50,7 +52,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -67,7 +70,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -82,7 +86,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -97,16 +102,17 @@ "env": "PROD", "database": "vmart", "schema": "public", - "projection_count": "9", - "udx_list": "APPROXIMATE_COUNT_DISTINCT_SYNOPSIS_INFO, APPROXIMATE_MEDIAN, APPROXIMATE_PERCENTILE, AcdDataToCount, AcdDataToLongSyn, AcdDataToSyn, AcdSynToCount, AcdSynToSyn, DelimitedExport, DelimitedExportMulti, EmptyMap, Explode, FAvroParser, FCefParser, FCsvParser, FDelimitedPairParser, FDelimitedParser, FIDXParser, FJSONParser, FRegexParser, FlexTokenizer, JsonExport, JsonExportMulti, KafkaAvroParser, KafkaCheckBrokers, KafkaExport, KafkaInsertDelimiters, KafkaInsertLengths, KafkaJsonParser, KafkaListManyTopics, KafkaListTopics, KafkaOffsets, KafkaParser, KafkaSource, KafkaTopicDetails, MSE, MapAggregate, MapAggregate, MapContainsKey, MapContainsKey, MapContainsValue, MapContainsValue, MapDelimitedExtractor, MapItems, MapItems, MapJSONExtractor, MapKeys, MapKeys, MapKeysInfo, MapKeysInfo, MapLookup, MapLookup, MapLookup, MapPut, MapRegexExtractor, MapSize, MapSize, 
MapToString, MapToString, MapValues, MapValues, MapValuesOrField, MapVersion, MapVersion, OrcExport, OrcExportMulti, PRC, ParquetExport, ParquetExportMulti, PickBestType, PickBestType, PickBestType, ROC, STV_AsGeoJSON, STV_AsGeoJSON, STV_AsGeoJSON, STV_Create_Index, STV_Create_Index, STV_Create_Index, STV_DWithin, STV_DWithin, STV_DWithin, STV_Describe_Index, STV_Drop_Index, STV_Export2Shapefile, STV_Extent, STV_Extent, STV_ForceLHR, STV_Geography, STV_Geography, STV_GeographyPoint, STV_Geometry, STV_Geometry, STV_GeometryPoint, STV_GeometryPoint, STV_GetExportShapefileDirectory, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_IsValidReason, STV_IsValidReason, STV_IsValidReason, STV_LineStringPoint, STV_LineStringPoint, STV_LineStringPoint, STV_MemSize, STV_MemSize, STV_MemSize, STV_NN, STV_NN, STV_NN, STV_PolygonPoint, STV_PolygonPoint, STV_PolygonPoint, STV_Refresh_Index, STV_Refresh_Index, STV_Refresh_Index, STV_Rename_Index, STV_Reverse, STV_SetExportShapefileDirectory, STV_ShpCreateTable, STV_ShpParser, STV_ShpSource, ST_Area, ST_Area, ST_Area, ST_AsBinary, ST_AsBinary, ST_AsBinary, ST_AsText, ST_AsText, ST_AsText, ST_Boundary, ST_Buffer, ST_Centroid, ST_Contains, ST_Contains, ST_Contains, ST_ConvexHull, ST_Crosses, ST_Difference, ST_Disjoint, ST_Disjoint, ST_Disjoint, ST_Distance, ST_Distance, ST_Distance, ST_Envelope, ST_Equals, ST_Equals, ST_Equals, ST_GeoHash, ST_GeoHash, ST_GeoHash, ST_GeographyFromText, ST_GeographyFromWKB, ST_GeomFromGeoHash, ST_GeomFromGeoJSON, ST_GeomFromGeoJSON, ST_GeomFromText, ST_GeomFromText, ST_GeomFromWKB, ST_GeomFromWKB, ST_GeometryN, ST_GeometryN, ST_GeometryN, ST_GeometryType, ST_GeometryType, ST_GeometryType, ST_Intersection, ST_Intersects, ST_Intersects, ST_IsEmpty, ST_IsEmpty, ST_IsEmpty, ST_IsSimple, ST_IsSimple, ST_IsSimple, ST_IsValid, ST_IsValid, ST_IsValid, ST_Length, ST_Length, ST_Length, ST_NumGeometries, ST_NumGeometries, ST_NumGeometries, ST_NumPoints, ST_NumPoints, ST_NumPoints, ST_Overlaps, ST_PointFromGeoHash, ST_PointN, ST_PointN, ST_PointN, ST_Relate, ST_SRID, ST_SRID, ST_SRID, ST_Simplify, ST_SimplifyPreserveTopology, ST_SymDifference, ST_Touches, ST_Touches, ST_Touches, ST_Transform, ST_Union, ST_Union, ST_Within, ST_Within, ST_Within, ST_X, ST_X, ST_X, ST_XMax, ST_XMax, ST_XMax, ST_XMin, ST_XMin, ST_XMin, ST_Y, ST_Y, ST_Y, ST_YMax, ST_YMax, ST_YMax, ST_YMin, ST_YMin, ST_YMin, ST_intersects, SetMapKeys, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_NumCol, VoltageSecureAccess, VoltageSecureAccess, VoltageSecureConfigure, VoltageSecureConfigureGlobal, VoltageSecureProtect, VoltageSecureProtect, VoltageSecureProtectAllKeys, VoltageSecureRefreshPolicy, VoltageSecureVersion, append_centers, apply_bisecting_kmeans, apply_iforest, apply_inverse_pca, apply_inverse_svd, apply_kmeans, apply_normalize, apply_one_hot_encoder, apply_pca, apply_svd, approximate_quantiles, ar_create_blobs, ar_final_newton, ar_save_model, ar_transition_newton, avg_all_columns_local, bisecting_kmeans_init_model, bk_apply_best_kmeans_results, bk_compute_totss_local, bk_finalize_model, bk_get_rows_in_active_cluster, bk_kmeans_compute_local_centers, bk_kmeans_compute_withinss, bk_kmeans_fast_random_init, bk_kmeans_slow_random_init, bk_kmeanspp_init_cur_cluster, bk_kmeanspp_reset_blob, bk_kmeanspp_select_new_centers, bk_kmeanspp_within_chunk_sum, bk_save_final_model, bk_write_new_cluster_level, blob_to_table, bufUdx, bufUdx, 
calc_pseudo_centers, calculate_alpha_linear, calculate_hessian_linear1, calculate_hessian_linear2, cleanup_kmeans_files, compute_and_save_global_center, compute_and_save_new_centers, compute_local_totss, compute_local_withinss, compute_new_local_centers, confusion_matrix, coordinate_descent_covariance, corr_matrix, count_rows_in_blob, create_aggregator_blob, error_rate, evaluate_naive_bayes_model, evaluate_reg_model, evaluate_svm_model, export_model_files, finalize_blob_resource_group, get_attr_minmax, get_attr_robust_zscore, get_attr_zscore, get_model_attribute, get_model_summary, get_robust_zscore_median, iforest_create_blobs, iforest_phase0_udf1, iforest_phase0_udf2, iforest_phase1_udf1, iforest_phase1_udf2, iforest_phase1_udf3, iforest_phase1_udf4, iforest_phase2_udf1, iforest_phase2_udf2, iforest_phase2_udf3, iforest_phase2_udf4, iforest_save_model, import_model_files, isOrContains, kmeansAddMetricsToModel, kmeans_init_blobs, kmeans_to_write_final_centers, lift_table, line_search_logistic1, line_search_logistic2, load_rows_into_blocks, map_factor, math_op, matrix_global_xtx, matrix_local_xtx, mode_finder, model_converter, naive_bayes_phase1, naive_bayes_phase1_blob, naive_bayes_phase2, pca_prep1_global, pca_prep1_local, pca_prep2, pmml_parser, predict_autoregressor, predict_linear_reg, predict_logistic_reg, predict_moving_average, predict_naive_bayes, predict_naive_bayes_classes, predict_pmml, predict_rf_classifier, predict_rf_classifier_classes, predict_rf_regressor, predict_svm_classifier, predict_svm_regressor, predict_xgb_classifier, predict_xgb_classifier_classes, predict_xgb_regressor, random_init, random_init_write, read_from_dfblob, read_map_factor, read_ptree, read_tree, reg_final_bfgs, reg_final_newton, reg_transition_bfgs, reg_transition_newton, reg_write_model, remove_blob, reverse_normalize, rf_blob, rf_clean, rf_phase0_udf1, rf_phase0_udf2, rf_phase1_udf1, rf_phase1_udf2, rf_phase1_udf3, rf_phase1_udf4, rf_phase2_udf1, rf_phase2_udf2, rf_phase2_udf3, rf_phase2_udf4, rf_predictor_importance, rf_save_model, rsquared, save_cv_result, save_pca_model, save_svd_model, save_svm_model, select_new_centers, store_minmax_model, store_one_hot_encoder_model, store_robust_zscore_model, store_zscore_model, table_to_blob, table_to_dfblob, update_and_return_sum_of_squared_distances, upgrade_model_format, writeInitialKmeansModelToDfs, xgb_create_blobs, xgb_phase0_udf1, xgb_phase0_udf2, xgb_phase1_udf1, xgb_phase1_udf2, xgb_phase1_udf3, xgb_phase2_udf1, xgb_phase2_udf2, xgb_phase2_udf3, xgb_prune, xgb_save_model, yule_walker, ", - "udx_language": "ComplexTypesLib -- Functions for Complex Types | DelimitedExportLib -- Delimited data export package | JsonExportLib -- Json data export package | MachineLearningLib -- Machine learning package | OrcExportLib -- Orc export package | ParquetExportLib -- Parquet export package | ApproximateLib -- Approximate package | FlexTableLib -- Flexible Tables Data Load and Query | KafkaLib -- Kafka streaming load and export | PlaceLib -- Geospatial package | VoltageSecureLib -- Voltage SecureData Connector | " + "projection_count": "12", + "udx_list": "APPROXIMATE_COUNT_DISTINCT_SYNOPSIS_INFO, APPROXIMATE_MEDIAN, APPROXIMATE_PERCENTILE, AcdDataToCount, AcdDataToLongSyn, AcdDataToSyn, AcdSynToCount, AcdSynToSyn, DelimitedExport, DelimitedExportMulti, EmptyMap, Explode, FAvroParser, FCefParser, FCsvParser, FDelimitedPairParser, FDelimitedParser, FIDXParser, FJSONParser, FRegexParser, FlexTokenizer, JsonExport, JsonExportMulti, KafkaAvroParser, 
KafkaCheckBrokers, KafkaExport, KafkaInsertDelimiters, KafkaInsertLengths, KafkaJsonParser, KafkaListManyTopics, KafkaListTopics, KafkaOffsets, KafkaParser, KafkaSource, KafkaTopicDetails, MSE, MapAggregate, MapAggregate, MapContainsKey, MapContainsKey, MapContainsValue, MapContainsValue, MapDelimitedExtractor, MapItems, MapItems, MapJSONExtractor, MapKeys, MapKeys, MapKeysInfo, MapKeysInfo, MapLookup, MapLookup, MapLookup, MapPut, MapRegexExtractor, MapSize, MapSize, MapToString, MapToString, MapValues, MapValues, MapValuesOrField, MapVersion, MapVersion, OrcExport, OrcExportMulti, PRC, ParquetExport, ParquetExportMulti, PickBestType, PickBestType, PickBestType, ROC, STV_AsGeoJSON, STV_AsGeoJSON, STV_AsGeoJSON, STV_Create_Index, STV_Create_Index, STV_Create_Index, STV_DWithin, STV_DWithin, STV_DWithin, STV_Describe_Index, STV_Drop_Index, STV_Export2Shapefile, STV_Extent, STV_Extent, STV_ForceLHR, STV_Geography, STV_Geography, STV_GeographyPoint, STV_Geometry, STV_Geometry, STV_GeometryPoint, STV_GeometryPoint, STV_GetExportShapefileDirectory, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_Intersect, STV_IsValidReason, STV_IsValidReason, STV_IsValidReason, STV_LineStringPoint, STV_LineStringPoint, STV_LineStringPoint, STV_MemSize, STV_MemSize, STV_MemSize, STV_NN, STV_NN, STV_NN, STV_PolygonPoint, STV_PolygonPoint, STV_PolygonPoint, STV_Refresh_Index, STV_Refresh_Index, STV_Refresh_Index, STV_Rename_Index, STV_Reverse, STV_SetExportShapefileDirectory, STV_ShpCreateTable, STV_ShpParser, STV_ShpSource, ST_Area, ST_Area, ST_Area, ST_AsBinary, ST_AsBinary, ST_AsBinary, ST_AsText, ST_AsText, ST_AsText, ST_Boundary, ST_Buffer, ST_Centroid, ST_Contains, ST_Contains, ST_Contains, ST_ConvexHull, ST_Crosses, ST_Difference, ST_Disjoint, ST_Disjoint, ST_Disjoint, ST_Distance, ST_Distance, ST_Distance, ST_Envelope, ST_Equals, ST_Equals, ST_Equals, ST_GeoHash, ST_GeoHash, ST_GeoHash, ST_GeographyFromText, ST_GeographyFromWKB, ST_GeomFromGeoHash, ST_GeomFromGeoJSON, ST_GeomFromGeoJSON, ST_GeomFromText, ST_GeomFromText, ST_GeomFromWKB, ST_GeomFromWKB, ST_GeometryN, ST_GeometryN, ST_GeometryN, ST_GeometryType, ST_GeometryType, ST_GeometryType, ST_Intersection, ST_Intersects, ST_Intersects, ST_IsEmpty, ST_IsEmpty, ST_IsEmpty, ST_IsSimple, ST_IsSimple, ST_IsSimple, ST_IsValid, ST_IsValid, ST_IsValid, ST_Length, ST_Length, ST_Length, ST_NumGeometries, ST_NumGeometries, ST_NumGeometries, ST_NumPoints, ST_NumPoints, ST_NumPoints, ST_Overlaps, ST_PointFromGeoHash, ST_PointN, ST_PointN, ST_PointN, ST_Relate, ST_SRID, ST_SRID, ST_SRID, ST_Simplify, ST_SimplifyPreserveTopology, ST_SymDifference, ST_Touches, ST_Touches, ST_Touches, ST_Transform, ST_Union, ST_Union, ST_Within, ST_Within, ST_Within, ST_X, ST_X, ST_X, ST_XMax, ST_XMax, ST_XMax, ST_XMin, ST_XMin, ST_XMin, ST_Y, ST_Y, ST_Y, ST_YMax, ST_YMax, ST_YMax, ST_YMin, ST_YMin, ST_YMin, ST_intersects, SetMapKeys, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_CatCol, Summarize_NumCol, Unnest, VoltageSecureAccess, VoltageSecureAccess, VoltageSecureConfigure, VoltageSecureConfigureGlobal, VoltageSecureProtect, VoltageSecureProtect, VoltageSecureProtectAllKeys, VoltageSecureRefreshPolicy, VoltageSecureVersion, append_centers, apply_bisecting_kmeans, apply_iforest, apply_inverse_pca, apply_inverse_svd, apply_kmeans, apply_kprototypes, apply_normalize, apply_one_hot_encoder, apply_pca, apply_svd, approximate_quantiles, ar_create_blobs, ar_final_newton, ar_save_model, 
ar_transition_newton, arima_bfgs, arima_line_search, arima_save_model, avg_all_columns_local, bisecting_kmeans_init_model, bk_apply_best_kmeans_results, bk_compute_totss_local, bk_finalize_model, bk_get_rows_in_active_cluster, bk_kmeans_compute_local_centers, bk_kmeans_compute_withinss, bk_kmeans_fast_random_init, bk_kmeans_slow_random_init, bk_kmeanspp_init_cur_cluster, bk_kmeanspp_reset_blob, bk_kmeanspp_select_new_centers, bk_kmeanspp_within_chunk_sum, bk_save_final_model, bk_write_new_cluster_level, blob_to_table, bufUdx, bufUdx, calc_pseudo_centers, calculate_alpha_linear, calculate_hessian_linear1, calculate_hessian_linear2, chi_squared, cleanup_kmeans_files, compute_and_save_global_center, compute_and_save_new_centers, compute_local_totss, compute_local_withinss, compute_new_local_centers, confusion_matrix, coordinate_descent_covariance, corr_matrix, count_rows_in_blob, create_aggregator_blob, error_rate, evaluate_naive_bayes_model, evaluate_reg_model, evaluate_svm_model, export_model_files, finalize_blob_resource_group, get_attr_minmax, get_attr_robust_zscore, get_attr_zscore, get_model_attribute, get_model_summary, get_robust_zscore_median, iforest_create_blobs, iforest_phase0_udf1, iforest_phase0_udf2, iforest_phase1_udf1, iforest_phase1_udf2, iforest_phase1_udf3, iforest_phase1_udf4, iforest_phase2_udf1, iforest_phase2_udf2, iforest_phase2_udf3, iforest_phase2_udf4, iforest_save_model, import_model_files, isOrContains, kmeansAddMetricsToModel, kmeans_init_blobs, kmeans_to_write_final_centers, lift_table, line_search_logistic1, line_search_logistic2, load_rows_into_blocks, map_factor, math_op, matrix_global_xtx, matrix_local_xtx, mode_finder, model_converter, naive_bayes_phase1, naive_bayes_phase1_blob, naive_bayes_phase2, pca_prep1_global, pca_prep1_local, pca_prep2, pmml_parser, predict_arima, predict_autoregressor, predict_linear_reg, predict_logistic_reg, predict_moving_average, predict_naive_bayes, predict_naive_bayes_classes, predict_pmml, predict_poisson_reg, predict_rf_classifier, predict_rf_classifier_classes, predict_rf_regressor, predict_svm_classifier, predict_svm_regressor, predict_xgb_classifier, predict_xgb_classifier_classes, predict_xgb_regressor, random_init, random_init_write, read_from_dfblob, read_map_factor, read_ptree, read_tree, reg_final_bfgs, reg_final_newton, reg_transition_bfgs, reg_transition_newton, reg_write_model, remove_blob, reverse_normalize, rf_blob, rf_clean, rf_phase0_udf1, rf_phase0_udf2, rf_phase1_udf1, rf_phase1_udf2, rf_phase1_udf3, rf_phase1_udf4, rf_phase2_udf1, rf_phase2_udf2, rf_phase2_udf3, rf_phase2_udf4, rf_predictor_importance, rf_save_model, rsquared, save_cv_result, save_pca_model, save_svd_model, save_svm_model, select_new_centers, store_minmax_model, store_one_hot_encoder_model, store_robust_zscore_model, store_zscore_model, table_to_blob, table_to_dfblob, tokenize, topk, update_and_return_sum_of_squared_distances, upgrade_model_format, writeInitialKmeansModelToDfs, xgb_create_blobs, xgb_phase0_udf1, xgb_phase0_udf2, xgb_phase1_udf1, xgb_phase1_udf2, xgb_phase1_udf3, xgb_phase2_udf1, xgb_phase2_udf2, xgb_phase2_udf3, xgb_predictor_importance, xgb_prune, xgb_save_model, yule_walker, ", + "udx_language": "ComplexTypesLib -- Functions for Complex Types | DelimitedExportLib -- Delimited data export package | JsonExportLib -- Json data export package | MachineLearningLib -- Machine learning package | OrcExportLib -- Orc export package | ParquetExportLib -- Parquet export package | ApproximateLib -- Approximate package | 
FlexTableLib -- Flexible Tables Data Load and Query | KafkaLib -- Kafka streaming load and export | PlaceLib -- Geospatial package | VoltageSecureLib -- Voltage SecureData Connector | TransformFunctions -- User-defined Python library | " }, "name": "public" } }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -121,7 +127,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -136,7 +143,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -153,7 +161,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -168,7 +177,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -188,7 +198,184 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "create_time": "2023-10-13 11:23:05.308022+00:00", + "table_size": "0 KB" + }, + "name": "clicks", + "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.clicks", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "user_id", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "page_id", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "click_time", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.TimeType": {} + } + }, + "nativeDataType": "TIMESTAMP_WITH_PRECISION()", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -212,7 +399,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -227,7 +415,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -243,7 +432,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.358215+00:00" + "create_time": "2023-10-13 11:22:37.846965+00:00", + "table_size": "2119 KB" }, "name": "customer_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -551,7 +741,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -568,7 +759,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -592,7 +784,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -616,7 +809,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -631,7 +825,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -647,7 +842,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.368954+00:00" + "create_time": "2023-10-13 11:22:37.857152+00:00", + "table_size": "138 KB" }, "name": "date_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -955,7 +1151,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -972,7 +1169,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -996,7 +1194,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1020,7 +1219,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1035,7 +1235,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1051,7 +1252,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.375896+00:00" + "create_time": "2023-10-13 11:22:37.863745+00:00", + "table_size": "327 KB" }, "name": "employee_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -1320,7 +1522,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1337,7 +1540,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1361,7 +1565,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1385,7 +1590,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1400,7 +1606,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1416,7 +1623,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.385843+00:00" + "create_time": "2023-10-13 11:22:37.873181+00:00", + "table_size": "2564 KB" }, "name": "inventory_fact", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -1529,7 +1737,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1546,7 +1755,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1570,12 +1780,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -1594,12 +1805,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -1609,13 +1821,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -1625,16 +1838,17 @@ { 
"com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.362016+00:00" + "create_time": "2023-10-13 11:23:05.408507+00:00", + "table_size": "0 KB" }, - "name": "product_dimension", + "name": "phrases", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.product_dimension", + "schemaName": "public.phrases", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -1653,33 +1867,7 @@ }, "fields": [ { - "fieldPath": "product_key", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": true - }, - { - "fieldPath": "product_version", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "product_description", + "fieldPath": "phrase", "nullable": true, "description": "", "type": { @@ -1690,76 +1878,252 @@ "nativeDataType": "VARCHAR(length=128)", "recursive": false, "isPartOfKey": false - }, - { - "fieldPath": "sku_number", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "category_description", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "department_description", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "package_type_description", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "package_size", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "fat_content", - "nullable": true, - "description": "", + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "create_time": "2023-10-13 11:22:37.850505+00:00", + "table_size": "19 KB" + }, + "name": "product_dimension", + "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.product_dimension", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "product_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "product_version", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "product_description", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=128)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "sku_number", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "category_description", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "department_description", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "package_type_description", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "package_size", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "fat_content", + "nullable": true, + "description": "", "type": { "type": { "com.linkedin.pegasus2avro.schema.NumberType": {} @@ -1933,7 +2297,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1950,7 +2315,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1974,7 +2340,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -1998,7 +2365,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2013,7 +2381,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": 
"vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2029,7 +2398,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.365453+00:00" + "create_time": "2023-10-13 11:22:37.853878+00:00", + "table_size": "3 KB" }, "name": "promotion_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -2220,7 +2590,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2237,7 +2608,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2261,12 +2633,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -2285,12 +2658,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -2300,13 +2674,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -2316,16 +2691,17 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.379273+00:00" + "create_time": "2023-10-13 11:23:05.296044+00:00", + "table_size": "0 KB" }, - "name": "shipping_dimension", + "name": "readings", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.shipping_dimension", + "schemaName": "public.readings", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -2344,7 +2720,7 @@ }, "fields": [ { - "fieldPath": "shipping_key", + "fieldPath": "meter_id", "nullable": true, "description": "", "type": { @@ -2354,39 +2730,215 @@ }, "nativeDataType": "INTEGER()", "recursive": false, - "isPartOfKey": true + "isPartOfKey": false }, { - "fieldPath": "ship_type", + "fieldPath": "reading_date", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.TimeType": {} } }, - "nativeDataType": "CHAR(length=30)", + "nativeDataType": "TIMESTAMP_WITH_PRECISION()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "ship_mode", + "fieldPath": "reading_value", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=10)", + "nativeDataType": "FLOAT()", "recursive": false, "isPartOfKey": false - }, - { - "fieldPath": "ship_carrier", - "nullable": true, - "description": "", - "type": { + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Table" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "create_time": "2023-10-13 11:22:37.867119+00:00", + "table_size": "1 KB" + }, + "name": "shipping_dimension", + "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.shipping_dimension", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "shipping_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": true + }, + { + "fieldPath": "ship_type", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=30)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_mode", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=10)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_carrier", + "nullable": true, + "description": "", + "type": { "type": { "com.linkedin.pegasus2avro.schema.StringType": {} } @@ -2403,7 +2955,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2420,7 +2973,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2444,7 +2998,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2468,7 +3023,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2483,7 +3039,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2499,7 +3056,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.372409+00:00" + "create_time": "2023-10-13 11:22:37.860541+00:00", + "table_size": "1 KB" }, "name": "vendor_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -2638,7 +3196,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2655,7 +3214,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2679,7 +3239,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2703,7 +3264,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2718,7 +3280,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2734,7 +3297,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:38:20.045598+00:00" + "create_time": "2023-10-13 11:23:04.970568+00:00", + "table_size": "0 KB" }, "name": "vmart_load_success", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -2782,7 +3346,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2799,7 +3364,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2823,7 +3389,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2847,7 +3414,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2862,7 +3430,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -2878,7 +3447,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.382549+00:00" + "create_time": "2023-10-13 11:22:37.870169+00:00", + "table_size": "2 KB" }, "name": "warehouse_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -2991,7 +3561,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3008,7 +3579,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -3032,12 +3604,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -3056,12 +3629,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -3071,13 +3645,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -3087,23 +3662,19 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "ROS_Count": "1", - "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(date_dimension.date_key)", - "projection_size": "138 KB", - "Partition_Key": "Not Available", - "Partition_Size": "0", - "Projection_Cached": "False" + "create_time": "2023-10-13 11:23:05.319029+00:00", + "table_size": "0 KB", + "view_definition": "SELECT sum(customer_dimension.annual_income) AS SUM, customer_dimension.customer_state FROM public.customer_dimension WHERE (customer_dimension.customer_key IN (SELECT store_sales_fact.customer_key FROM store.store_sales_fact)) GROUP BY customer_dimension.customer_state ORDER BY customer_dimension.customer_state", + "is_view": "True" }, - "name": "date_dimension_super", - "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "name": "sampleview", + "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.date_dimension_super", + "schemaName": "public.sampleview", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -3122,7 +3693,7 @@ }, "fields": [ { - "fieldPath": "date_key", + "fieldPath": "SUM", "nullable": true, "description": "", "type": { @@ -3135,33 +3706,7 @@ "isPartOfKey": false }, { - "fieldPath": "date", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} - } - }, - "nativeDataType": "DATE()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "full_date_description", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=18)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "day_of_week", + "fieldPath": "customer_state", "nullable": true, "description": "", "type": { @@ -3169,228 +3714,7 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=9)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "day_number_in_calendar_month", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "day_number_in_calendar_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "day_number_in_fiscal_month", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "day_number_in_fiscal_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "last_day_in_week_indicator", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "last_day_in_month_indicator", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_week_number_in_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_month_name", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=9)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_month_number_in_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": 
"calendar_year_month", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=7)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_quarter", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_year_quarter", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=7)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_half_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "calendar_year", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "holiday_indicator", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=10)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "weekday_indicator", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=7)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "selling_season", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=2)", "recursive": false, "isPartOfKey": false } @@ -3402,29 +3726,49 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { "json": { "typeNames": [ - "Projections" + "View" ] } }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", + "changeType": "UPSERT", + "aspectName": "viewProperties", + "aspect": { + "json": { + "materialized": false, + "viewLogic": "SELECT sum(customer_dimension.annual_income) AS SUM, customer_dimension.customer_state FROM public.customer_dimension WHERE (customer_dimension.customer_key IN (SELECT store_sales_fact.customer_key FROM store.store_sales_fact)) GROUP BY customer_dimension.customer_state ORDER BY customer_dimension.customer_state", + "viewLanguage": "SQL" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": 
"no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -3435,7 +3779,15 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,store.store_sales_fact,PROD)", "type": "TRANSFORMED" } ] @@ -3443,12 +3795,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.sampleview,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -3467,12 +3820,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -3491,12 +3845,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -3506,13 +3861,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -3524,21 +3880,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(product_dimension.product_key, product_dimension.product_version)", - "projection_size": "19 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(date_dimension.date_key)", + "Projection_size": "138 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "product_dimension_super", + "name": "date_dimension_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.product_dimension_super", + "schemaName": "public.date_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -3557,7 +3913,7 @@ }, "fields": [ { - "fieldPath": "product_key", + "fieldPath": "date_key", "nullable": true, "description": "", "type": { @@ -3570,20 +3926,20 @@ "isPartOfKey": false }, { - "fieldPath": "product_version", + "fieldPath": "date", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.DateType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "DATE()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "product_description", + "fieldPath": "full_date_description", "nullable": true, "description": "", "type": { @@ -3591,12 +3947,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=128)", + "nativeDataType": "VARCHAR(length=18)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "sku_number", + "fieldPath": "day_of_week", "nullable": true, "description": "", "type": { @@ -3604,64 +3960,64 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "VARCHAR(length=9)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "category_description", + "fieldPath": "day_number_in_calendar_month", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "department_description", + "fieldPath": "day_number_in_calendar_year", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "package_type_description", + "fieldPath": "day_number_in_fiscal_month", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "package_size", + "fieldPath": "day_number_in_fiscal_year", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "fat_content", + "fieldPath": "last_day_in_week_indicator", "nullable": true, "description": "", "type": { @@ -3674,20 +4030,20 @@ "isPartOfKey": false }, { - "fieldPath": "diet_type", + "fieldPath": "last_day_in_month_indicator", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "INTEGER()", 
"recursive": false, "isPartOfKey": false }, { - "fieldPath": "weight", + "fieldPath": "calendar_week_number_in_year", "nullable": true, "description": "", "type": { @@ -3700,7 +4056,7 @@ "isPartOfKey": false }, { - "fieldPath": "weight_units_of_measure", + "fieldPath": "calendar_month_name", "nullable": true, "description": "", "type": { @@ -3708,12 +4064,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "VARCHAR(length=9)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "shelf_width", + "fieldPath": "calendar_month_number_in_year", "nullable": true, "description": "", "type": { @@ -3726,20 +4082,20 @@ "isPartOfKey": false }, { - "fieldPath": "shelf_height", + "fieldPath": "calendar_year_month", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "CHAR(length=7)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "shelf_depth", + "fieldPath": "calendar_quarter", "nullable": true, "description": "", "type": { @@ -3752,20 +4108,20 @@ "isPartOfKey": false }, { - "fieldPath": "product_price", + "fieldPath": "calendar_year_quarter", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "CHAR(length=7)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "product_cost", + "fieldPath": "calendar_half_year", "nullable": true, "description": "", "type": { @@ -3778,7 +4134,7 @@ "isPartOfKey": false }, { - "fieldPath": "lowest_competitor_price", + "fieldPath": "calendar_year", "nullable": true, "description": "", "type": { @@ -3791,41 +4147,41 @@ "isPartOfKey": false }, { - "fieldPath": "highest_competitor_price", + "fieldPath": "holiday_indicator", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "VARCHAR(length=10)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "average_competitor_price", + "fieldPath": "weekday_indicator", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "CHAR(length=7)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "discontinued_flag", + "fieldPath": "selling_season", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false } @@ -3837,12 +4193,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -3854,12 +4211,13 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -3870,7 +4228,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension,PROD)", "type": "TRANSFORMED" } ] @@ -3878,12 +4236,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.date_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -3902,12 +4261,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -3926,12 +4286,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -3941,13 +4302,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -3959,21 +4321,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(promotion_dimension.promotion_key)", - "projection_size": "3 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(product_dimension.product_key, product_dimension.product_version)", + "Projection_size": "19 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "promotion_dimension_super", + "name": "product_dimension_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.promotion_dimension_super", + "schemaName": "public.product_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -3992,7 +4354,7 @@ }, "fields": [ { - "fieldPath": "promotion_key", + "fieldPath": "product_key", "nullable": true, "description": "", "type": { @@ -4005,7 +4367,20 @@ "isPartOfKey": false }, { - "fieldPath": "promotion_name", + "fieldPath": "product_version", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "product_description", "nullable": true, "description": "", "type": { @@ -4018,7 +4393,7 @@ "isPartOfKey": false }, { - "fieldPath": "price_reduction_type", + "fieldPath": "sku_number", "nullable": true, "description": "", "type": { @@ -4026,12 +4401,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "promotion_media_type", + "fieldPath": "category_description", "nullable": true, "description": "", "type": { @@ -4039,12 +4414,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "ad_type", + "fieldPath": "department_description", "nullable": true, "description": "", "type": { @@ -4052,12 +4427,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "display_type", + "fieldPath": "package_type_description", "nullable": true, "description": "", "type": { @@ -4065,12 +4440,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "coupon_type", + "fieldPath": "package_size", "nullable": true, "description": "", "type": { @@ -4078,12 +4453,25 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "ad_media_name", + "fieldPath": "fat_content", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "diet_type", "nullable": true, "description": "", "type": { @@ -4091,12 +4479,25 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "display_provider", + "fieldPath": "weight", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "weight_units_of_measure", "nullable": true, "description": 
"", "type": { @@ -4104,12 +4505,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=128)", + "nativeDataType": "CHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "promotion_cost", + "fieldPath": "shelf_width", "nullable": true, "description": "", "type": { @@ -4122,28 +4523,106 @@ "isPartOfKey": false }, { - "fieldPath": "promotion_begin_date", + "fieldPath": "shelf_height", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "DATE()", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "promotion_end_date", + "fieldPath": "shelf_depth", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "DATE()", + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "product_price", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "product_cost", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "lowest_competitor_price", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "highest_competitor_price", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "average_competitor_price", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "discontinued_flag", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false } @@ -4155,12 +4634,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -4172,12 +4652,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": 
{ @@ -4188,7 +4669,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension,PROD)", "type": "TRANSFORMED" } ] @@ -4196,12 +4677,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.product_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -4220,12 +4702,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -4244,12 +4727,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -4259,13 +4743,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -4277,21 +4762,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(vendor_dimension.vendor_key)", - "projection_size": "1 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(promotion_dimension.promotion_key)", + "Projection_size": "3 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "vendor_dimension_super", + "name": "promotion_dimension_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.vendor_dimension_super", + "schemaName": "public.promotion_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -4310,7 +4795,7 @@ }, "fields": [ { - "fieldPath": "vendor_key", + "fieldPath": "promotion_key", "nullable": true, "description": "", "type": { @@ -4323,7 +4808,7 @@ "isPartOfKey": false }, { - "fieldPath": "vendor_name", + "fieldPath": "promotion_name", "nullable": true, "description": "", "type": { @@ -4331,12 +4816,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=64)", + "nativeDataType": "VARCHAR(length=128)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "vendor_address", + "fieldPath": "price_reduction_type", "nullable": true, "description": "", "type": { @@ -4344,12 +4829,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=64)", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "vendor_city", + "fieldPath": "promotion_media_type", "nullable": true, "description": "", "type": { @@ -4357,12 +4842,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=64)", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "vendor_state", + "fieldPath": "ad_type", "nullable": true, "description": "", "type": { @@ -4370,12 +4855,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "CHAR(length=2)", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "vendor_region", + "fieldPath": "display_type", "nullable": true, "description": "", "type": { @@ -4388,7 +4873,46 @@ "isPartOfKey": false }, { - "fieldPath": "deal_size", + "fieldPath": "coupon_type", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ad_media_name", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "display_provider", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=128)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "promotion_cost", "nullable": true, "description": "", "type": { @@ -4401,7 +4925,20 @@ "isPartOfKey": false }, { - "fieldPath": "last_deal_update", + "fieldPath": "promotion_begin_date", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "promotion_end_date", "nullable": true, "description": "", "type": { @@ -4421,12 +4958,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { 
"entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -4438,12 +4976,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -4454,7 +4993,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension,PROD)", "type": "TRANSFORMED" } ] @@ -4462,12 +5001,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.promotion_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -4486,12 +5026,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -4510,12 +5051,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -4525,13 +5067,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -4543,21 +5086,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(customer_dimension.customer_key)", - "projection_size": "2119 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(vendor_dimension.vendor_key)", + "Projection_size": "1 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": 
"customer_dimension_super", + "name": "vendor_dimension_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.customer_dimension_super", + "schemaName": "public.vendor_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -4576,7 +5119,7 @@ }, "fields": [ { - "fieldPath": "customer_key", + "fieldPath": "vendor_key", "nullable": true, "description": "", "type": { @@ -4589,46 +5132,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_type", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=16)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "customer_name", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=256)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "customer_gender", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=8)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "title", + "fieldPath": "vendor_name", "nullable": true, "description": "", "type": { @@ -4636,25 +5140,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=8)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "household_id", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", + "nativeDataType": "VARCHAR(length=64)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "customer_address", + "fieldPath": "vendor_address", "nullable": true, "description": "", "type": { @@ -4662,12 +5153,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=256)", + "nativeDataType": "VARCHAR(length=64)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "customer_city", + "fieldPath": "vendor_city", "nullable": true, "description": "", "type": { @@ -4680,7 +5171,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_state", + "fieldPath": "vendor_state", "nullable": true, "description": "", "type": { @@ -4693,20 +5184,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_region", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=64)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "marital_status", + "fieldPath": "vendor_region", "nullable": true, "description": "", "type": { @@ -4719,72 +5197,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_age", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "number_of_children", - "nullable": true, - "description": "", - "type": { - "type": { - 
"com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "annual_income", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "occupation", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=64)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "largest_bill_amount", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "store_membership_card", + "fieldPath": "deal_size", "nullable": true, "description": "", "type": { @@ -4797,46 +5210,7 @@ "isPartOfKey": false }, { - "fieldPath": "customer_since", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} - } - }, - "nativeDataType": "DATE()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "deal_stage", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=32)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "deal_size", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "last_deal_update", + "fieldPath": "last_deal_update", "nullable": true, "description": "", "type": { @@ -4856,12 +5230,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -4873,12 +5248,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -4889,7 +5265,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension,PROD)", "type": "TRANSFORMED" } ] @@ -4897,12 +5273,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "entityUrn": 
"urn:li:dataset:(urn:li:dataPlatform:vertica,public.vendor_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -4921,12 +5298,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -4945,12 +5323,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -4960,13 +5339,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -4978,21 +5358,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(employee_dimension.employee_key)", - "projection_size": "327 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(customer_dimension.customer_key)", + "Projection_size": "2119 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "employee_dimension_super", + "name": "customer_dimension_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.employee_dimension_super", + "schemaName": "public.customer_dimension_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -5011,7 +5391,7 @@ }, "fields": [ { - "fieldPath": "employee_key", + "fieldPath": "customer_key", "nullable": true, "description": "", "type": { @@ -5024,7 +5404,33 @@ "isPartOfKey": false }, { - "fieldPath": "employee_gender", + "fieldPath": "customer_type", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=16)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_name", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=256)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_gender", "nullable": true, "description": "", "type": { @@ -5037,7 +5443,7 @@ "isPartOfKey": false }, { - "fieldPath": "courtesy_title", + "fieldPath": "title", "nullable": true, "description": "", "type": { @@ -5050,7 +5456,33 @@ "isPartOfKey": false }, { - "fieldPath": "employee_first_name", + "fieldPath": "household_id", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_address", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=256)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_city", "nullable": true, "description": "", "type": { @@ -5063,7 +5495,7 @@ "isPartOfKey": false }, { - "fieldPath": "employee_middle_initial", + "fieldPath": "customer_state", "nullable": true, "description": "", "type": { @@ -5071,12 +5503,12 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "VARCHAR(length=8)", + "nativeDataType": "CHAR(length=2)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "employee_last_name", + "fieldPath": "customer_region", "nullable": true, "description": "", "type": { @@ -5089,7 +5521,20 @@ "isPartOfKey": false }, { - "fieldPath": "employee_age", + "fieldPath": "marital_status", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_age", "nullable": true, "description": "", "type": { @@ -5102,33 +5547,33 @@ "isPartOfKey": false }, { - "fieldPath": "hire_date", + "fieldPath": "number_of_children", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.DateType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "DATE()", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "employee_street_address", + "fieldPath": "annual_income", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + 
"com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "VARCHAR(length=256)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "employee_city", + "fieldPath": "occupation", "nullable": true, "description": "", "type": { @@ -5141,20 +5586,46 @@ "isPartOfKey": false }, { - "fieldPath": "employee_state", + "fieldPath": "largest_bill_amount", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "CHAR(length=2)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "employee_region", + "fieldPath": "store_membership_card", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "customer_since", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "deal_stage", "nullable": true, "description": "", "type": { @@ -5162,25 +5633,1087 @@ "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "CHAR(length=32)", + "nativeDataType": "VARCHAR(length=32)", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "job_title", + "fieldPath": "deal_size", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "last_deal_update", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Projections" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.customer_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": 
"urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "ROS_Count": "1", + "Projection_Type": "is_super_projection", + "Is_Segmented": "True", + "Segmentation_key": "hash(employee_dimension.employee_key)", + "Projection_size": "327 KB", + "Partition_Key": "Not Available", + "Number_Of_Partitions": "0", + "Projection_Cached": "False" + }, + "name": "employee_dimension_super", + "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.employee_dimension_super", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "employee_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_gender", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=8)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "courtesy_title", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=8)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_first_name", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=64)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_middle_initial", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=8)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_last_name", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=64)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_age", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "hire_date", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "nativeDataType": "DATE()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_street_address", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=256)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_city", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=64)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_state", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "employee_region", + "nullable": true, + "description": "", + "type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=32)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "job_title", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=64)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "reports_to", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "salaried_flag", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "annual_salary", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "hourly_rate", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "FLOAT()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "vacation_days", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Projections" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": 
"UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "ROS_Count": "1", + "Projection_Type": "is_super_projection", + "Is_Segmented": "True", + "Segmentation_key": "hash(warehouse_dimension.warehouse_key)", + "Projection_size": "2 KB", + "Partition_Key": "Not Available", + "Number_Of_Partitions": "0", + "Projection_Cached": "False" + }, + "name": "warehouse_dimension_super", + "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.warehouse_dimension_super", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "warehouse_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "warehouse_name", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=20)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "warehouse_address", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=256)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "warehouse_city", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=60)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "warehouse_state", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=2)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": 
"warehouse_region", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "VARCHAR(length=32)", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Projections" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "ROS_Count": "1", + "Projection_Type": "is_super_projection", + "Is_Segmented": "True", + "Segmentation_key": "hash(shipping_dimension.shipping_key)", + "Projection_size": "1 KB", + "Partition_Key": 
"Not Available", + "Number_Of_Partitions": "0", + "Projection_Cached": "False" + }, + "name": "shipping_dimension_super", + "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.shipping_dimension_super", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "shipping_key", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "nativeDataType": "INTEGER()", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_type", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=30)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_mode", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=10)", + "recursive": false, + "isPartOfKey": false + }, + { + "fieldPath": "ship_carrier", + "nullable": true, + "description": "", + "type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "nativeDataType": "CHAR(length=20)", + "recursive": false, + "isPartOfKey": false + } + ] + } + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "subTypes", + "aspect": { + "json": { + "typeNames": [ + "Projections" + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "type": "TRANSFORMED" + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "changeType": "UPSERT", + "aspectName": "browsePathsV2", + "aspect": { + "json": { + "path": [ + { + "id": "urn:li:container:343f520ad0fb3259b298736800bb1385", + "urn": "urn:li:container:343f520ad0fb3259b298736800bb1385" + }, + { + "id": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4", + "urn": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + ] + } + }, + "systemMetadata": { + 
"lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "changeType": "UPSERT", + "aspectName": "ownership", + "aspect": { + "json": { + "owners": [ + { + "owner": "urn:li:corpuser:dbadmin", + "type": "DATAOWNER" + } + ], + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + } + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "changeType": "UPSERT", + "aspectName": "container", + "aspect": { + "json": { + "container": "urn:li:container:eb682025a9113b5543ec7ed26bfa21e4" + } + }, + "systemMetadata": { + "lastObserved": 1586847600000, + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" + } +}, +{ + "proposedSnapshot": { + "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "aspects": [ + { + "com.linkedin.pegasus2avro.common.Status": { + "removed": false + } + }, + { + "com.linkedin.pegasus2avro.dataset.DatasetProperties": { + "customProperties": { + "ROS_Count": "1", + "Projection_Type": "is_super_projection", + "Is_Segmented": "True", + "Segmentation_key": "hash(inventory_fact.date_key, inventory_fact.product_key, inventory_fact.product_version, inventory_fact.warehouse_key, inventory_fact.qty_in_stock)", + "Projection_size": "2564 KB", + "Partition_Key": "Not Available", + "Number_Of_Partitions": "0", + "Projection_Cached": "False" + }, + "name": "inventory_fact_super", + "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", + "tags": [] + } + }, + { + "com.linkedin.pegasus2avro.schema.SchemaMetadata": { + "schemaName": "public.inventory_fact_super", + "platform": "urn:li:dataPlatform:vertica", + "version": 0, + "created": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "lastModified": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "hash": "", + "platformSchema": { + "com.linkedin.pegasus2avro.schema.MySqlDDL": { + "tableSchema": "" + } + }, + "fields": [ + { + "fieldPath": "date_key", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "VARCHAR(length=64)", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "reports_to", + "fieldPath": "product_key", "nullable": true, "description": "", "type": { @@ -5193,7 +6726,7 @@ "isPartOfKey": false }, { - "fieldPath": "salaried_flag", + "fieldPath": "product_version", "nullable": true, "description": "", "type": { @@ -5206,7 +6739,7 @@ "isPartOfKey": false }, { - "fieldPath": "annual_salary", + "fieldPath": "warehouse_key", "nullable": true, "description": "", "type": { @@ -5219,7 +6752,7 @@ "isPartOfKey": false }, { - "fieldPath": "hourly_rate", + "fieldPath": "qty_in_stock", "nullable": true, "description": "", "type": { @@ -5227,20 +6760,20 @@ "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "FLOAT()", + "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "vacation_days", + "fieldPath": "inventory_date", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} + "com.linkedin.pegasus2avro.schema.DateType": {} } }, - "nativeDataType": "INTEGER()", + "nativeDataType": "DATE()", "recursive": false, "isPartOfKey": false } @@ -5252,12 +6785,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -5269,12 +6803,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -5285,7 +6820,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact,PROD)", "type": "TRANSFORMED" } ] @@ -5293,12 +6828,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": 
"dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.employee_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -5317,12 +6853,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -5341,12 +6878,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -5356,13 +6894,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -5373,22 +6912,22 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "ROS_Count": "1", - "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(warehouse_dimension.warehouse_key)", - "projection_size": "2 KB", + "Projection_Type": "is_aggregate_projection, has_expressions", + "Is_Segmented": "True", + "Segmentation_key": "hash(readings.meter_id)", + "Projection_size": "0 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "warehouse_dimension_super", + "name": "readings_topk", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.warehouse_dimension_super", + "schemaName": "public.readings_topk", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -5407,7 +6946,7 @@ }, "fields": [ { - "fieldPath": "warehouse_key", + "fieldPath": "meter_id", "nullable": true, "description": "", "type": { @@ -5420,67 +6959,28 @@ "isPartOfKey": false }, { - "fieldPath": "warehouse_name", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=20)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "warehouse_address", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=256)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "warehouse_city", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "VARCHAR(length=60)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "warehouse_state", + "fieldPath": "recent_date", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.TimeType": {} } }, - "nativeDataType": "CHAR(length=2)", + "nativeDataType": "TIMESTAMP_WITH_PRECISION()", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "warehouse_region", + "fieldPath": "recent_value", "nullable": true, "description": "", "type": { "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} + "com.linkedin.pegasus2avro.schema.NumberType": {} } }, - "nativeDataType": "VARCHAR(length=32)", + "nativeDataType": "FLOAT()", "recursive": false, "isPartOfKey": false } @@ -5492,12 +6992,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -5509,12 +7010,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -5525,7 +7027,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "type": "TRANSFORMED" } ] @@ -5533,12 +7035,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": 
"dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.warehouse_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.readings_topk,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -5557,12 +7060,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", "aspectName": "ownership", "aspect": { @@ -5581,12 +7085,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -5596,13 +7101,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -5613,22 +7119,22 @@ "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { "ROS_Count": "1", - "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(shipping_dimension.shipping_key)", - "projection_size": "1 KB", + "Projection_Type": "is_aggregate_projection, has_expressions", + "Is_Segmented": "True", + "Segmentation_key": "hash(clicks.page_id, (clicks.click_time)::date)", + "Projection_size": "0 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "shipping_dimension_super", + "name": "clicks_agg", "description": "Vertica physically stores table data in projections, which are collections of table columns. 
Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.shipping_dimension_super", + "schemaName": "public.clicks_agg", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -5647,7 +7153,7 @@ }, "fields": [ { - "fieldPath": "shipping_key", + "fieldPath": "page_id", "nullable": true, "description": "", "type": { @@ -5658,45 +7164,6 @@ "nativeDataType": "INTEGER()", "recursive": false, "isPartOfKey": false - }, - { - "fieldPath": "ship_type", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=30)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "ship_mode", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=10)", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "ship_carrier", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.StringType": {} - } - }, - "nativeDataType": "CHAR(length=20)", - "recursive": false, - "isPartOfKey": false } ] } @@ -5706,12 +7173,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -5723,12 +7191,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -5739,7 +7208,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "type": "TRANSFORMED" } ] @@ -5747,12 +7216,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.shipping_dimension_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.clicks_agg,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -5771,12 +7241,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", 
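Every hunk in this golden file follows the same mechanical pattern: each "systemMetadata" block gains a "lastRunId" field defaulting to "no-run-id-provided". A minimal sanity-check sketch in Python, assuming the golden file sits at the hypothetical path below (the actual filename is not shown in this excerpt):

import json

# Hypothetical path; adjust to the golden file's actual location in the repo.
GOLDEN = "metadata-ingestion/tests/integration/vertica/vertica_mces_with_db_golden.json"

with open(GOLDEN) as f:
    records = json.load(f)  # the golden file is a top-level JSON array

# Every record that carries systemMetadata should now include the new default.
missing = [
    r
    for r in records
    if "systemMetadata" in r
    and r["systemMetadata"].get("lastRunId") != "no-run-id-provided"
]
print(f"records missing lastRunId: {len(missing)}")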
"aspectName": "ownership", "aspect": { @@ -5795,12 +7266,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", "aspectName": "container", "aspect": { @@ -5810,13 +7282,14 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "proposedSnapshot": { "com.linkedin.pegasus2avro.metadata.snapshot.DatasetSnapshot": { - "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "urn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "aspects": [ { "com.linkedin.pegasus2avro.common.Status": { @@ -5828,21 +7301,21 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", - "Segmentation_key": "hash(inventory_fact.date_key, inventory_fact.product_key, inventory_fact.product_version, inventory_fact.warehouse_key, inventory_fact.qty_in_stock)", - "projection_size": "2566 KB", + "Is_Segmented": "True", + "Segmentation_key": "hash(phrases.phrase)", + "Projection_size": "0 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, - "name": "inventory_fact_super", + "name": "phrases_super", "description": "Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution For more info on projections and corresponding properties check out the Vertica Docs: https://www.vertica.com/docs", "tags": [] } }, { "com.linkedin.pegasus2avro.schema.SchemaMetadata": { - "schemaName": "public.inventory_fact_super", + "schemaName": "public.phrases_super", "platform": "urn:li:dataPlatform:vertica", "version": 0, "created": { @@ -5861,80 +7334,15 @@ }, "fields": [ { - "fieldPath": "date_key", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "product_key", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "product_version", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "warehouse_key", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "qty_in_stock", - "nullable": true, - "description": "", - "type": { - "type": { - "com.linkedin.pegasus2avro.schema.NumberType": {} - } - }, - "nativeDataType": "INTEGER()", - "recursive": false, - "isPartOfKey": false - }, - { - "fieldPath": "inventory_date", + "fieldPath": "phrase", "nullable": true, "description": "", "type": { "type": 
{ - "com.linkedin.pegasus2avro.schema.DateType": {} + "com.linkedin.pegasus2avro.schema.StringType": {} } }, - "nativeDataType": "DATE()", + "nativeDataType": "VARCHAR(length=128)", "recursive": false, "isPartOfKey": false } @@ -5946,12 +7354,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", "aspectName": "subTypes", "aspect": { @@ -5963,12 +7372,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", "aspectName": "upstreamLineage", "aspect": { @@ -5979,7 +7389,7 @@ "time": 0, "actor": "urn:li:corpuser:unknown" }, - "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact,PROD)", + "dataset": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases,PROD)", "type": "TRANSFORMED" } ] @@ -5987,12 +7397,13 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { "entityType": "dataset", - "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.inventory_fact_super,PROD)", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:vertica,public.phrases_super,PROD)", "changeType": "UPSERT", "aspectName": "browsePathsV2", "aspect": { @@ -6011,7 +7422,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6035,7 +7447,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6050,7 +7463,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6065,7 +7479,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6082,7 +7497,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6097,7 +7513,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6117,7 +7534,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6141,7 +7559,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ 
-6156,7 +7575,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6172,7 +7592,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.393181+00:00" + "create_time": "2023-10-13 11:22:37.879951+00:00", + "table_size": "2 KB" }, "name": "store_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -6441,7 +7862,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6458,7 +7880,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6482,7 +7905,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6506,7 +7930,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6521,7 +7946,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6537,7 +7963,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.404717+00:00" + "create_time": "2023-10-13 11:22:37.890717+00:00", + "table_size": "8646 KB" }, "name": "store_orders_fact", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -6819,7 +8246,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6836,7 +8264,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6860,7 +8289,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6884,7 +8314,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6899,7 +8330,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -6915,7 +8347,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.396731+00:00" + "create_time": "2023-10-13 11:22:37.883186+00:00", + "table_size": "225060 KB" }, "name": "store_sales_fact", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -7171,7 +8604,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7188,7 +8622,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7212,7 +8647,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7236,7 +8672,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7251,7 +8688,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7269,11 +8707,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(store_dimension.store_key)", - "projection_size": "2 KB", + "Projection_size": "2 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "store_dimension_super", @@ -7543,7 +8981,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7560,7 +8999,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7584,7 +9024,8 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7608,7 +9049,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7632,7 +9074,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7647,7 +9090,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7665,11 +9109,11 @@ "customProperties": { "ROS_Count": "2", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(store_sales_fact.date_key, store_sales_fact.product_key, store_sales_fact.product_version, store_sales_fact.store_key, store_sales_fact.promotion_key, store_sales_fact.customer_key, store_sales_fact.employee_key, store_sales_fact.pos_transaction_number)", - "projection_size": "225089 KB", + "Projection_size": "225060 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "store_sales_fact_super", @@ -7926,7 +9370,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7943,7 +9388,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7967,7 +9413,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -7991,7 +9438,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8015,7 +9463,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8030,7 +9479,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8048,11 +9498,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(store_orders_fact.product_key, store_orders_fact.product_version, store_orders_fact.store_key, store_orders_fact.vendor_key, store_orders_fact.employee_key, store_orders_fact.order_number, store_orders_fact.date_ordered, store_orders_fact.date_shipped)", - "projection_size": "8648 KB", + "Projection_size": "8646 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "store_orders_fact_super", @@ -8335,7 +9785,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8352,7 +9803,8 @@ }, "systemMetadata": { "lastObserved": 
1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8376,7 +9828,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8400,7 +9853,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8424,7 +9878,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8439,7 +9894,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8454,7 +9910,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8471,7 +9928,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8486,7 +9944,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8506,7 +9965,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8530,7 +9990,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8545,7 +10006,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8561,7 +10023,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.415595+00:00" + "create_time": "2023-10-13 11:22:37.900841+00:00", + "table_size": "6 KB" }, "name": "call_center_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -8752,7 +10215,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8769,7 +10233,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8793,7 +10258,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8817,7 +10283,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8832,7 +10299,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8848,7 +10316,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.412266+00:00" + "create_time": "2023-10-13 11:22:37.897788+00:00", + "table_size": "9 KB" }, "name": "online_page_dimension", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. ", @@ -8961,7 +10430,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -8978,7 +10448,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9002,7 +10473,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9026,7 +10498,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9041,7 +10514,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9057,7 +10531,8 @@ { "com.linkedin.pegasus2avro.dataset.DatasetProperties": { "customProperties": { - "create_time": "2023-07-26 06:37:53.419260+00:00" + "create_time": "2023-10-13 11:22:37.903963+00:00", + "table_size": "182356 KB" }, "name": "online_sales_fact", "description": "References the properties of a native table in Vertica. Vertica physically stores table data in projections, which are collections of table columns. Projections store data in a format that optimizes query execution. In order to query or perform any operation on a Vertica table, the table must have one or more projections associated with it. 
", @@ -9352,7 +10827,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9369,7 +10845,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9393,7 +10870,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9417,7 +10895,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9432,7 +10911,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9450,11 +10930,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(online_page_dimension.online_page_key)", - "projection_size": "9 KB", + "Projection_size": "9 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "online_page_dimension_super", @@ -9568,7 +11048,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9585,7 +11066,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9609,7 +11091,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9633,7 +11116,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9657,7 +11141,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9672,7 +11157,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9690,11 +11176,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(call_center_dimension.call_center_key)", - "projection_size": "6 KB", + "Projection_size": "6 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "call_center_dimension_super", @@ -9886,7 +11372,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9903,7 +11390,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9927,7 +11415,8 @@ 
}, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9951,7 +11440,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9975,7 +11465,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -9990,7 +11481,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -10008,11 +11500,11 @@ "customProperties": { "ROS_Count": "1", "Projection_Type": "is_super_projection", - "is_segmented": "True", + "Is_Segmented": "True", "Segmentation_key": "hash(online_sales_fact.sale_date_key, online_sales_fact.ship_date_key, online_sales_fact.product_key, online_sales_fact.product_version, online_sales_fact.customer_key, online_sales_fact.call_center_key, online_sales_fact.online_page_key, online_sales_fact.shipping_key)", - "projection_size": "182385 KB", + "Projection_size": "182356 KB", "Partition_Key": "Not Available", - "Partition_Size": "0", + "Number_Of_Partitions": "0", "Projection_Cached": "False" }, "name": "online_sales_fact_super", @@ -10308,7 +11800,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -10325,7 +11818,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -10349,7 +11843,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } }, { @@ -10373,7 +11868,8 @@ }, "systemMetadata": { "lastObserved": 1586847600000, - "runId": "vertica-2020_04_14-07_00_00" + "runId": "vertica-2020_04_14-07_00_00", + "lastRunId": "no-run-id-provided" } } ] \ No newline at end of file diff --git a/metadata-ingestion/tests/integration/vertica/vertica_to_file.yml b/metadata-ingestion/tests/integration/vertica/vertica_to_file.yml index ebd800ee09ff57..a182e54bd53c71 100644 --- a/metadata-ingestion/tests/integration/vertica/vertica_to_file.yml +++ b/metadata-ingestion/tests/integration/vertica/vertica_to_file.yml @@ -5,6 +5,13 @@ source: database: Vmart username: dbadmin password: abc123 + include_tables: true + include_views: true + include_projections: true + include_models: true + include_view_lineage: true + include_projection_lineage: true + sink: type: file diff --git a/metadata-ingestion/tests/performance/bigquery/__init__.py b/metadata-ingestion/tests/performance/bigquery/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/performance/bigquery.py b/metadata-ingestion/tests/performance/bigquery/bigquery_events.py similarity index 100% rename from metadata-ingestion/tests/performance/bigquery.py rename to metadata-ingestion/tests/performance/bigquery/bigquery_events.py diff --git a/metadata-ingestion/tests/performance/test_bigquery_usage.py b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py similarity index 80% rename from 
metadata-ingestion/tests/performance/test_bigquery_usage.py rename to metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py index 7e05ef070b45d6..bbc3378450bffd 100644 --- a/metadata-ingestion/tests/performance/test_bigquery_usage.py +++ b/metadata-ingestion/tests/performance/bigquery/test_bigquery_usage.py @@ -2,13 +2,11 @@ import os import random from datetime import timedelta -from typing import Iterable, Tuple import humanfriendly import psutil from datahub.emitter.mce_builder import make_dataset_urn -from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.source.bigquery_v2.bigquery_config import ( BigQueryUsageConfig, BigQueryV2Config, @@ -16,12 +14,13 @@ from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.usage import BigQueryUsageExtractor from datahub.utilities.perf_timer import PerfTimer -from tests.performance.bigquery import generate_events, ref_from_table +from tests.performance.bigquery.bigquery_events import generate_events, ref_from_table from tests.performance.data_generation import ( NormalDistribution, generate_data, generate_queries, ) +from tests.performance.helpers import workunit_sink def run_test(): @@ -33,7 +32,7 @@ def run_test(): num_views=2000, time_range=timedelta(days=7), ) - all_tables = seed_metadata.tables + seed_metadata.views + all_tables = seed_metadata.all_tables config = BigQueryV2Config( start_time=seed_metadata.start_time, @@ -88,21 +87,6 @@ def run_test(): print(f"Hash collisions: {report.num_usage_query_hash_collisions}") -def workunit_sink(workunits: Iterable[MetadataWorkUnit]) -> Tuple[int, int]: - peak_memory_usage = psutil.Process(os.getpid()).memory_info().rss - i: int = 0 - for i, wu in enumerate(workunits): - if i % 10_000 == 0: - peak_memory_usage = max( - peak_memory_usage, psutil.Process(os.getpid()).memory_info().rss - ) - peak_memory_usage = max( - peak_memory_usage, psutil.Process(os.getpid()).memory_info().rss - ) - - return i, peak_memory_usage - - if __name__ == "__main__": root_logger = logging.getLogger() root_logger.setLevel(logging.INFO) diff --git a/metadata-ingestion/tests/performance/data_generation.py b/metadata-ingestion/tests/performance/data_generation.py index c530848f27f5c4..67b156896909a2 100644 --- a/metadata-ingestion/tests/performance/data_generation.py +++ b/metadata-ingestion/tests/performance/data_generation.py @@ -11,11 +11,14 @@ import uuid from dataclasses import dataclass from datetime import datetime, timedelta, timezone -from typing import Iterable, List, TypeVar +from typing import Iterable, List, TypeVar, Union, cast from faker import Faker from tests.performance.data_model import ( + Column, + ColumnMapping, + ColumnType, Container, FieldAccess, Query, @@ -52,15 +55,21 @@ def sample_with_floor(self, floor: int = 1) -> int: @dataclass class SeedMetadata: - containers: List[Container] + # Each list is a layer of containers, e.g. 
[[databases], [schemas]] + containers: List[List[Container]] + tables: List[Table] views: List[View] start_time: datetime end_time: datetime + @property + def all_tables(self) -> List[Table]: + return self.tables + cast(List[Table], self.views) + def generate_data( - num_containers: int, + num_containers: Union[List[int], int], num_tables: int, num_views: int, columns_per_table: NormalDistribution = NormalDistribution(5, 2), @@ -68,32 +77,52 @@ def generate_data( view_definition_length: NormalDistribution = NormalDistribution(150, 50), time_range: timedelta = timedelta(days=14), ) -> SeedMetadata: - containers = [Container(f"container-{i}") for i in range(num_containers)] + # Assemble containers + if isinstance(num_containers, int): + num_containers = [num_containers] + + containers: List[List[Container]] = [] + for i, num_in_layer in enumerate(num_containers): + layer = [ + Container( + f"{i}-container-{j}", + parent=random.choice(containers[-1]) if containers else None, + ) + for j in range(num_in_layer) + ] + containers.append(layer) + + # Assemble tables tables = [ Table( f"table-{i}", - container=random.choice(containers), + container=random.choice(containers[-1]), columns=[ f"column-{j}-{uuid.uuid4()}" for j in range(columns_per_table.sample_with_floor()) ], + column_mapping=None, ) for i in range(num_tables) ] views = [ View( f"view-{i}", - container=random.choice(containers), + container=random.choice(containers[-1]), columns=[ f"column-{j}-{uuid.uuid4()}" for j in range(columns_per_table.sample_with_floor()) ], + column_mapping=None, definition=f"{uuid.uuid4()}-{'*' * view_definition_length.sample_with_floor(10)}", parents=random.sample(tables, parents_per_view.sample_with_floor()), ) for i in range(num_views) ] + for table in tables + views: + _generate_column_mapping(table) + now = datetime.now(tz=timezone.utc) return SeedMetadata( containers=containers, @@ -162,6 +191,18 @@ def generate_queries( ) +def _generate_column_mapping(table: Table) -> ColumnMapping: + d = {} + for column in table.columns: + d[column] = Column( + name=column, + type=random.choice(list(ColumnType)), + nullable=random.random() < 0.1, # Fixed 10% chance for now + ) + table.column_mapping = d + return d + + def _sample_list(lst: List[T], dist: NormalDistribution, floor: int = 1) -> List[T]: return random.sample(lst, min(dist.sample_with_floor(floor), len(lst))) diff --git a/metadata-ingestion/tests/performance/data_model.py b/metadata-ingestion/tests/performance/data_model.py index c593e69ceb9a74..9425fa827070eb 100644 --- a/metadata-ingestion/tests/performance/data_model.py +++ b/metadata-ingestion/tests/performance/data_model.py @@ -1,10 +1,10 @@ from dataclasses import dataclass from datetime import datetime -from typing import List, Optional +from enum import Enum +from typing import Dict, List, Optional from typing_extensions import Literal -Column = str StatementType = Literal[ # SELECT + values from OperationTypeClass "SELECT", "INSERT", @@ -21,13 +21,36 @@ @dataclass class Container: name: str + parent: Optional["Container"] = None + + +class ColumnType(str, Enum): + # Can add types that take parameters in the future + + INTEGER = "INTEGER" + FLOAT = "FLOAT" # Double precision (64 bit) + STRING = "STRING" + BOOLEAN = "BOOLEAN" + DATETIME = "DATETIME" + + +@dataclass +class Column: + name: str + type: ColumnType + nullable: bool + + +ColumnRef = str +ColumnMapping = Dict[ColumnRef, Column] @dataclass class Table: name: str container: Container - columns: List[Column] + columns: List[ColumnRef] + 
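
A quick illustration of the layered-container change above: `generate_data` now accepts either an `int` or a list of per-layer sizes, and every container below the top layer picks its parent from the layer above. A minimal sketch, assuming the `tests.performance` helpers from this diff are importable:

```python
# Minimal sketch of the layered container generation introduced above.
# Assumes tests/performance from this PR is on the import path.
from tests.performance.data_generation import generate_data

seed = generate_data(
    num_containers=[1, 2, 5],  # e.g. metastore -> catalogs -> schemas
    num_tables=10,
    num_views=3,
)

assert len(seed.containers) == 3  # one list of containers per layer
for schema in seed.containers[-1]:
    # each container below the top layer has a parent in the layer above
    assert schema.parent in seed.containers[-2]
assert len(seed.all_tables) == 10 + 3  # tables plus views
```
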
column_mapping: Optional[ColumnMapping] def is_view(self) -> bool: return False @@ -44,7 +67,7 @@ def is_view(self) -> bool: @dataclass class FieldAccess: - column: Column + column: ColumnRef table: Table diff --git a/metadata-ingestion/tests/performance/databricks/__init__.py b/metadata-ingestion/tests/performance/databricks/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/performance/databricks/test_unity.py b/metadata-ingestion/tests/performance/databricks/test_unity.py new file mode 100644 index 00000000000000..cc9558f0692edf --- /dev/null +++ b/metadata-ingestion/tests/performance/databricks/test_unity.py @@ -0,0 +1,71 @@ +import logging +import os +from unittest.mock import patch + +import humanfriendly +import psutil + +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.unity.config import UnityCatalogSourceConfig +from datahub.ingestion.source.unity.source import UnityCatalogSource +from datahub.utilities.perf_timer import PerfTimer +from tests.performance.data_generation import ( + NormalDistribution, + generate_data, + generate_queries, +) +from tests.performance.databricks.unity_proxy_mock import UnityCatalogApiProxyMock +from tests.performance.helpers import workunit_sink + + +def run_test(): + seed_metadata = generate_data( + num_containers=[1, 100, 5000], + num_tables=50000, + num_views=10000, + columns_per_table=NormalDistribution(100, 50), + parents_per_view=NormalDistribution(5, 5), + view_definition_length=NormalDistribution(1000, 300), + ) + queries = generate_queries( + seed_metadata, + num_selects=100000, + num_operations=100000, + num_unique_queries=10000, + num_users=1000, + ) + proxy_mock = UnityCatalogApiProxyMock( + seed_metadata, queries=queries, num_service_principals=10000 + ) + print("Data generated") + + config = UnityCatalogSourceConfig( + token="", workspace_url="http://localhost:1234", include_usage_statistics=False + ) + ctx = PipelineContext(run_id="test") + with patch( + "datahub.ingestion.source.unity.source.UnityCatalogApiProxy", + lambda *args, **kwargs: proxy_mock, + ): + source: UnityCatalogSource = UnityCatalogSource(ctx, config) + + pre_mem_usage = psutil.Process(os.getpid()).memory_info().rss + print(f"Test data size: {humanfriendly.format_size(pre_mem_usage)}") + + with PerfTimer() as timer: + workunits = source.get_workunits() + num_workunits, peak_memory_usage = workunit_sink(workunits) + print(f"Workunits Generated: {num_workunits}") + print(f"Seconds Elapsed: {timer.elapsed_seconds():.2f} seconds") + + print( + f"Peak Memory Used: {humanfriendly.format_size(peak_memory_usage - pre_mem_usage)}" + ) + print(source.report.aspects) + + +if __name__ == "__main__": + root_logger = logging.getLogger() + root_logger.setLevel(logging.INFO) + root_logger.addHandler(logging.StreamHandler()) + run_test() diff --git a/metadata-ingestion/tests/performance/databricks/unity_proxy_mock.py b/metadata-ingestion/tests/performance/databricks/unity_proxy_mock.py new file mode 100644 index 00000000000000..593163e12bf0ac --- /dev/null +++ b/metadata-ingestion/tests/performance/databricks/unity_proxy_mock.py @@ -0,0 +1,183 @@ +import uuid +from collections import defaultdict +from datetime import datetime, timezone +from typing import Dict, Iterable, List + +from databricks.sdk.service.catalog import ColumnTypeName +from databricks.sdk.service.sql import QueryStatementType + +from datahub.ingestion.source.unity.proxy_types import ( + Catalog, + CatalogType, + Column, + 
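
For reference, the new column model is just a dict keyed by `ColumnRef` (a plain column name) mapping to the `Column` dataclass. A small hand-built example of the shapes defined in `data_model.py` above; the table and column names are illustrative only:

```python
# Hand-built example of the Column / ColumnMapping shapes defined above.
# Table and column names are illustrative only.
from tests.performance.data_model import Column, ColumnType, Container, Table

orders = Table(
    name="orders",
    container=Container("analytics"),
    columns=["id", "amount"],
    column_mapping=None,  # generate_data fills this via _generate_column_mapping
)
orders.column_mapping = {
    "id": Column(name="id", type=ColumnType.INTEGER, nullable=False),
    "amount": Column(name="amount", type=ColumnType.FLOAT, nullable=True),
}
```
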
Metastore, + Query, + Schema, + ServicePrincipal, + Table, + TableType, +) +from tests.performance import data_model +from tests.performance.data_generation import SeedMetadata +from tests.performance.data_model import ColumnType, StatementType + + +class UnityCatalogApiProxyMock: + """Mimics UnityCatalogApiProxy for performance testing.""" + + def __init__( + self, + seed_metadata: SeedMetadata, + queries: Iterable[data_model.Query] = (), + num_service_principals: int = 0, + ) -> None: + self.seed_metadata = seed_metadata + self.queries = queries + self.num_service_principals = num_service_principals + self.warehouse_id = "invalid-warehouse-id" + + # Cache for performance + self._schema_to_table: Dict[str, List[data_model.Table]] = defaultdict(list) + for table in seed_metadata.all_tables: + self._schema_to_table[table.container.name].append(table) + + def check_basic_connectivity(self) -> bool: + return True + + def assigned_metastore(self) -> Metastore: + container = self.seed_metadata.containers[0][0] + return Metastore( + id=container.name, + name=container.name, + global_metastore_id=container.name, + metastore_id=container.name, + comment=None, + owner=None, + cloud=None, + region=None, + ) + + def catalogs(self, metastore: Metastore) -> Iterable[Catalog]: + for container in self.seed_metadata.containers[1]: + if not container.parent or metastore.name != container.parent.name: + continue + + yield Catalog( + id=f"{metastore.id}.{container.name}", + name=container.name, + metastore=metastore, + comment=None, + owner=None, + type=CatalogType.MANAGED_CATALOG, + ) + + def schemas(self, catalog: Catalog) -> Iterable[Schema]: + for container in self.seed_metadata.containers[2]: + # Assumes all catalog names are unique + if not container.parent or catalog.name != container.parent.name: + continue + + yield Schema( + id=f"{catalog.id}.{container.name}", + name=container.name, + catalog=catalog, + comment=None, + owner=None, + ) + + def tables(self, schema: Schema) -> Iterable[Table]: + for table in self._schema_to_table[schema.name]: + columns = [] + if table.column_mapping: + for i, col_name in enumerate(table.columns): + column = table.column_mapping[col_name] + columns.append( + Column( + id=column.name, + name=column.name, + type_name=self._convert_column_type(column.type), + type_text=column.type.value, + nullable=column.nullable, + position=i, + comment=None, + type_precision=0, + type_scale=0, + ) + ) + + yield Table( + id=f"{schema.id}.{table.name}", + name=table.name, + schema=schema, + table_type=TableType.VIEW if table.is_view() else TableType.MANAGED, + columns=columns, + created_at=datetime.now(tz=timezone.utc), + comment=None, + owner=None, + storage_location=None, + data_source_format=None, + generation=None, + created_by="", + updated_at=None, + updated_by=None, + table_id="", + view_definition=table.definition + if isinstance(table, data_model.View) + else None, + properties={}, + ) + + def service_principals(self) -> Iterable[ServicePrincipal]: + for i in range(self.num_service_principals): + yield ServicePrincipal( + id=str(i), + application_id=str(uuid.uuid4()), + display_name=f"user-{i}", + active=True, + ) + + def query_history( + self, + start_time: datetime, + end_time: datetime, + ) -> Iterable[Query]: + for i, query in enumerate(self.queries): + yield Query( + query_id=str(i), + query_text=query.text, + statement_type=self._convert_statement_type(query.type), + start_time=query.timestamp, + end_time=query.timestamp, + user_id=hash(query.actor), + 
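
The mock maps the three container layers onto the Unity Catalog hierarchy (metastore → catalog → schema). A sketch of walking it, assuming `seed_metadata` was built with three container layers as in `run_test()` above:

```python
# Sketch: traversing the mocked Unity Catalog hierarchy. Assumes
# seed_metadata came from generate_data(num_containers=[1, ..., ...], ...).
proxy = UnityCatalogApiProxyMock(seed_metadata)
metastore = proxy.assigned_metastore()
for catalog in proxy.catalogs(metastore):
    for schema in proxy.schemas(catalog):
        for table in proxy.tables(schema):
            print(table.id)  # dotted id: "<metastore>.<catalog>.<schema>.<table>"
```
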
user_name=query.actor, + executed_as_user_id=hash(query.actor), + executed_as_user_name=None, + ) + + def table_lineage(self, table: Table) -> None: + pass + + def get_column_lineage(self, table: Table) -> None: + pass + + @staticmethod + def _convert_column_type(t: ColumnType) -> ColumnTypeName: + if t == ColumnType.INTEGER: + return ColumnTypeName.INT + elif t == ColumnType.FLOAT: + return ColumnTypeName.DOUBLE + elif t == ColumnType.STRING: + return ColumnTypeName.STRING + elif t == ColumnType.BOOLEAN: + return ColumnTypeName.BOOLEAN + elif t == ColumnType.DATETIME: + return ColumnTypeName.TIMESTAMP + else: + raise ValueError(f"Unknown column type: {t}") + + @staticmethod + def _convert_statement_type(t: StatementType) -> QueryStatementType: + if t == "CUSTOM" or t == "UNKNOWN": + return QueryStatementType.OTHER + else: + return QueryStatementType[t] diff --git a/metadata-ingestion/tests/performance/helpers.py b/metadata-ingestion/tests/performance/helpers.py new file mode 100644 index 00000000000000..eb98e53670c963 --- /dev/null +++ b/metadata-ingestion/tests/performance/helpers.py @@ -0,0 +1,21 @@ +import os +from typing import Iterable, Tuple + +import psutil + +from datahub.ingestion.api.workunit import MetadataWorkUnit + + +def workunit_sink(workunits: Iterable[MetadataWorkUnit]) -> Tuple[int, int]: + peak_memory_usage = psutil.Process(os.getpid()).memory_info().rss + i: int = 0 + for i, wu in enumerate(workunits): + if i % 10_000 == 0: + peak_memory_usage = max( + peak_memory_usage, psutil.Process(os.getpid()).memory_info().rss + ) + peak_memory_usage = max( + peak_memory_usage, psutil.Process(os.getpid()).memory_info().rss + ) + + return i, peak_memory_usage diff --git a/metadata-ingestion/tests/test_helpers/docker_helpers.py b/metadata-ingestion/tests/test_helpers/docker_helpers.py index 30157c3a780947..2eb61068196a23 100644 --- a/metadata-ingestion/tests/test_helpers/docker_helpers.py +++ b/metadata-ingestion/tests/test_helpers/docker_helpers.py @@ -1,5 +1,6 @@ import contextlib import logging +import os import subprocess from typing import Callable, Optional, Union @@ -78,6 +79,10 @@ def run( def cleanup_image(image_name: str) -> None: assert ":" not in image_name, "image_name should not contain a tag" + if not os.environ.get("CI"): + logger.debug("Not cleaning up images to speed up local development") + return + images_proc = subprocess.run( f"docker image ls --filter 'reference={image_name}*' -q", shell=True, diff --git a/metadata-ingestion/tests/unit/api/entities/datacontract/__init__.py b/metadata-ingestion/tests/unit/api/entities/datacontract/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/metadata-ingestion/tests/unit/api/entities/datacontract/test_data_quality_assertion.py b/metadata-ingestion/tests/unit/api/entities/datacontract/test_data_quality_assertion.py new file mode 100644 index 00000000000000..7be8b667a500b3 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/entities/datacontract/test_data_quality_assertion.py @@ -0,0 +1,55 @@ +from datahub.api.entities.datacontract.data_quality_assertion import ( + DataQualityAssertion, +) +from datahub.emitter.mcp import MetadataChangeProposalWrapper +from datahub.metadata.schema_classes import ( + AssertionInfoClass, + AssertionStdOperatorClass, + AssertionStdParameterClass, + AssertionStdParametersClass, + AssertionStdParameterTypeClass, + AssertionTypeClass, + AssertionValueChangeTypeClass, + SqlAssertionInfoClass, + SqlAssertionTypeClass, +) + + +def test_parse_sql_assertion(): + 
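
The extracted `workunit_sink` helper above is what both performance tests now share: it drains a workunit generator while sampling the process RSS via psutil every 10,000 workunits. A typical call, where `source` is assumed to be any ingestion source:

```python
# Sketch: draining a source through the shared helper above. `source` is
# assumed to be any ingestion source exposing get_workunits().
import humanfriendly

from datahub.utilities.perf_timer import PerfTimer
from tests.performance.helpers import workunit_sink

with PerfTimer() as timer:
    num_workunits, peak_memory_usage = workunit_sink(source.get_workunits())

print(f"Workunits: {num_workunits} in {timer.elapsed_seconds():.2f}s")
print(f"Peak memory: {humanfriendly.format_size(peak_memory_usage)}")
```
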
assertion_urn = "urn:li:assertion:a" + entity_urn = "urn:li:dataset:d" + statement = "SELECT COUNT(*) FROM my_table WHERE value IS NOT NULL" + + d = { + "type": "custom_sql", + "sql": statement, + "operator": {"type": "between", "min": 5, "max": 10}, + } + + assert DataQualityAssertion.parse_obj(d).generate_mcp( + assertion_urn, entity_urn + ) == [ + MetadataChangeProposalWrapper( + entityUrn=assertion_urn, + aspect=AssertionInfoClass( + type=AssertionTypeClass.SQL, + sqlAssertion=SqlAssertionInfoClass( + type=SqlAssertionTypeClass.METRIC, + changeType=AssertionValueChangeTypeClass.ABSOLUTE, + entity=entity_urn, + statement="SELECT COUNT(*) FROM my_table WHERE value IS NOT NULL", + operator=AssertionStdOperatorClass.BETWEEN, + parameters=AssertionStdParametersClass( + minValue=AssertionStdParameterClass( + value="5", + type=AssertionStdParameterTypeClass.NUMBER, + ), + maxValue=AssertionStdParameterClass( + value="10", + type=AssertionStdParameterTypeClass.NUMBER, + ), + ), + ), + ), + ) + ] diff --git a/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_less_upstreams_in_gms_aspect_golden.json b/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_less_upstreams_in_gms_aspect_golden.json new file mode 100644 index 00000000000000..812566143014b5 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_less_upstreams_in_gms_aspect_golden.json @@ -0,0 +1,106 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD)", + "type": "TRANSFORMED" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_a)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_a)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_b)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_b)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_c)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_c)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_a)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_a)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_a)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + 
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_b)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_b)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_b)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_c)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_c)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_c)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "run-id", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_more_upstreams_in_gms_aspect_golden.json b/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_more_upstreams_in_gms_aspect_golden.json new file mode 100644 index 00000000000000..17f4d10728268f --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/incremental_cll_more_upstreams_in_gms_aspect_golden.json @@ -0,0 +1,120 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD)", + "changeType": "UPSERT", + "aspectName": "upstreamLineage", + "aspect": { + "json": { + "upstreams": [ + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD)", + "type": "TRANSFORMED" + }, + { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream3,PROD)", + "type": "TRANSFORMED" + } + ], + "fineGrainedLineages": [ + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_a)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_a)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream3,PROD),col_a)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_a)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_b)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_b)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream3,PROD),col_b)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_b)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_c)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_c)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream3,PROD),col_c)" + ], + "downstreamType": "FIELD", + 
"downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_c)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_a)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_a)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_a)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_b)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_b)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_b)" + ], + "confidenceScore": 1.0 + }, + { + "upstreamType": "FIELD_SET", + "upstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD),col_c)", + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD),col_c)" + ], + "downstreamType": "FIELD", + "downstreams": [ + "urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD),col_c)" + ], + "confidenceScore": 1.0 + } + ] + } + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "run-id", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/api/source_helpers/incremental_table_lineage_golden.json b/metadata-ingestion/tests/unit/api/source_helpers/incremental_table_lineage_golden.json new file mode 100644 index 00000000000000..c828373c73080b --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/incremental_table_lineage_golden.json @@ -0,0 +1,41 @@ +[ +{ + "entityType": "dataset", + "entityUrn": "urn:li:dataset:(urn:li:dataPlatform:platform,dataset1,PROD)", + "changeType": "PATCH", + "aspectName": "upstreamLineage", + "aspect": { + "json": [ + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aplatform%2Cupstream1%2CPROD%29", + "value": { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream1,PROD)", + "type": "TRANSFORMED" + } + }, + { + "op": "add", + "path": "/upstreams/urn%3Ali%3Adataset%3A%28urn%3Ali%3AdataPlatform%3Aplatform%2Cupstream2%2CPROD%29", + "value": { + "auditStamp": { + "time": 0, + "actor": "urn:li:corpuser:unknown" + }, + "dataset": "urn:li:dataset:(urn:li:dataPlatform:platform,upstream2,PROD)", + "type": "TRANSFORMED" + } + } + ] + }, + "systemMetadata": { + "lastObserved": 1643871600000, + "runId": "run-id", + "lastRunId": "no-run-id-provided" + } +} +] \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py new file mode 100644 index 00000000000000..e8485106c6a818 --- /dev/null +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_incremental_lineage_helper.py @@ -0,0 +1,261 @@ +from typing import List, Optional +from unittest.mock import MagicMock + +import pytest + +import datahub.metadata.schema_classes as models +from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn +from datahub.emitter.mcp import 
MetadataChangeProposalWrapper +from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage +from datahub.ingestion.api.workunit import MetadataWorkUnit +from datahub.ingestion.sink.file import write_metadata_file +from tests.test_helpers import mce_helpers + +platform = "platform" +system_metadata = models.SystemMetadataClass(lastObserved=1643871600000, runId="run-id") + + +def make_lineage_aspect( + dataset_name: str, + upstreams: List[str], + timestamp: int = 0, + columns: List[str] = [], + include_cll: bool = False, +) -> models.UpstreamLineageClass: + """ + Generates dataset properties and upstream lineage aspects + with simple column to column lineage between current dataset and all upstreams + """ + + dataset_urn = make_dataset_urn(platform, dataset_name) + return models.UpstreamLineageClass( + upstreams=[ + models.UpstreamClass( + dataset=upstream_urn, + type=models.DatasetLineageTypeClass.TRANSFORMED, + auditStamp=models.AuditStampClass( + time=timestamp, actor="urn:li:corpuser:unknown" + ), + ) + for upstream_urn in upstreams + ], + fineGrainedLineages=[ + models.FineGrainedLineageClass( + upstreamType=models.FineGrainedLineageUpstreamTypeClass.FIELD_SET, + downstreamType=models.FineGrainedLineageDownstreamTypeClass.FIELD, + upstreams=[ + make_schema_field_urn(upstream_urn, col) + for upstream_urn in upstreams + ], + downstreams=[make_schema_field_urn(dataset_urn, col)], + ) + for col in columns + ] + if include_cll + else None, + ) + + +def base_table_lineage_aspect() -> models.UpstreamLineageClass: + return make_lineage_aspect( + "dataset1", + upstreams=[ + make_dataset_urn(platform, name) for name in ["upstream1", "upstream2"] + ], + ) + + +def base_cll_aspect(timestamp: int = 0) -> models.UpstreamLineageClass: + return make_lineage_aspect( + "dataset1", + upstreams=[ + make_dataset_urn(platform, name) for name in ["upstream1", "upstream2"] + ], + timestamp=timestamp, + columns=["col_a", "col_b", "col_c"], + include_cll=True, + ) + + +def test_incremental_table_lineage(tmp_path, pytestconfig): + test_resources_dir = pytestconfig.rootpath / "tests/unit/api/source_helpers" + test_file = tmp_path / "incremental_table_lineage.json" + golden_file = test_resources_dir / "incremental_table_lineage_golden.json" + + urn = make_dataset_urn(platform, "dataset1") + aspect = base_table_lineage_aspect() + + processed_wus = auto_incremental_lineage( + graph=None, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=urn, aspect=aspect, systemMetadata=system_metadata + ).as_workunit() + ], + ) + + write_metadata_file( + test_file, + [wu.metadata for wu in processed_wus], + ) + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, output_path=test_file, golden_path=golden_file + ) + + +def test_incremental_table_lineage_empty_upstreams(tmp_path, pytestconfig): + + urn = make_dataset_urn(platform, "dataset1") + aspect = make_lineage_aspect( + "dataset1", + upstreams=[], + ) + + processed_wus = auto_incremental_lineage( + graph=None, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=urn, aspect=aspect, systemMetadata=system_metadata + ).as_workunit() + ], + ) + + assert [wu.metadata for wu in processed_wus] == [] + + +@pytest.mark.parametrize( + "gms_aspect,current_aspect,output_aspect", + [ + # emitting CLL upstreamLineage over table level upstreamLineage + [ + base_table_lineage_aspect(), + base_cll_aspect(), + base_cll_aspect(), + ], + # emitting upstreamLineage for the first time + [ + None, + 
base_cll_aspect(), + base_cll_aspect(), + ], + # emitting CLL upstreamLineage over same CLL upstreamLineage + [ + base_cll_aspect(), + base_cll_aspect(), + base_cll_aspect(), + ], + # emitting CLL upstreamLineage over same CLL upstreamLineage but with earlier timestamp + [ + base_cll_aspect(), # default timestamp is 0 + base_cll_aspect(timestamp=1643871600000), + base_cll_aspect(timestamp=1643871600000), + ], + ], +) +def test_incremental_column_level_lineage( + gms_aspect: Optional[models.UpstreamLineageClass], + current_aspect: models.UpstreamLineageClass, + output_aspect: models.UpstreamLineageClass, +) -> None: + mock_graph = MagicMock() + mock_graph.get_aspect.return_value = gms_aspect + dataset_urn = make_dataset_urn(platform, "dataset1") + + processed_wus = auto_incremental_lineage( + graph=mock_graph, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=dataset_urn, + aspect=current_aspect, + systemMetadata=system_metadata, + ).as_workunit() + ], + ) + + wu: MetadataWorkUnit = next(iter(processed_wus)) + aspect = wu.get_aspect_of_type(models.UpstreamLineageClass) + assert aspect == output_aspect + + +def test_incremental_column_lineage_less_upstreams_in_gms_aspect( + tmp_path, pytestconfig +): + test_resources_dir = pytestconfig.rootpath / "tests/unit/api/source_helpers" + test_file = tmp_path / "incremental_cll_less_upstreams_in_gms_aspect.json" + golden_file = ( + test_resources_dir / "incremental_cll_less_upstreams_in_gms_aspect_golden.json" + ) + + urn = make_dataset_urn(platform, "dataset1") + aspect = base_cll_aspect() + + mock_graph = MagicMock() + mock_graph.get_aspect.return_value = make_lineage_aspect( + "dataset1", + upstreams=[make_dataset_urn(platform, name) for name in ["upstream1"]], + columns=["col_a", "col_b", "col_c"], + include_cll=True, + ) + + processed_wus = auto_incremental_lineage( + graph=mock_graph, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=urn, aspect=aspect, systemMetadata=system_metadata + ).as_workunit() + ], + ) + + write_metadata_file( + test_file, + [wu.metadata for wu in processed_wus], + ) + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, output_path=test_file, golden_path=golden_file + ) + + +def test_incremental_column_lineage_more_upstreams_in_gms_aspect( + tmp_path, pytestconfig +): + test_resources_dir = pytestconfig.rootpath / "tests/unit/api/source_helpers" + test_file = tmp_path / "incremental_cll_more_upstreams_in_gms_aspect.json" + golden_file = ( + test_resources_dir / "incremental_cll_more_upstreams_in_gms_aspect_golden.json" + ) + + urn = make_dataset_urn(platform, "dataset1") + aspect = base_cll_aspect() + + mock_graph = MagicMock() + mock_graph.get_aspect.return_value = make_lineage_aspect( + "dataset1", + upstreams=[ + make_dataset_urn(platform, name) + for name in ["upstream1", "upstream2", "upstream3"] + ], + columns=["col_a", "col_b", "col_c"], + include_cll=True, + ) + + processed_wus = auto_incremental_lineage( + graph=mock_graph, + incremental_lineage=True, + stream=[ + MetadataChangeProposalWrapper( + entityUrn=urn, aspect=aspect, systemMetadata=system_metadata + ).as_workunit() + ], + ) + + write_metadata_file( + test_file, + [wu.metadata for wu in processed_wus], + ) + mce_helpers.check_golden_file( + pytestconfig=pytestconfig, output_path=test_file, golden_path=golden_file + ) diff --git a/metadata-ingestion/tests/unit/test_source_helpers.py b/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py similarity index 86% 
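
Taken together, these tests pin down the `auto_incremental_lineage` contract: without a graph, a table-level `upstreamLineage` UPSERT is rewritten into a PATCH of per-upstream `add` operations (see `incremental_table_lineage_golden.json` above), while with a graph the emitted fine-grained lineage is merged with whatever GMS already holds. A condensed sketch of the no-graph path, reusing the helpers defined in this test module:

```python
# Condensed sketch of the no-graph path exercised above: a plain
# upstreamLineage UPSERT comes out the other side as a single PATCH workunit.
from datahub.ingestion.api.incremental_lineage_helper import auto_incremental_lineage

wus = list(
    auto_incremental_lineage(
        graph=None,
        incremental_lineage=True,
        stream=[
            MetadataChangeProposalWrapper(
                entityUrn=make_dataset_urn(platform, "dataset1"),
                aspect=base_table_lineage_aspect(),
            ).as_workunit()
        ],
    )
)
assert len(wus) == 1  # one PATCH MCP replacing the original UPSERT
```
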
rename from metadata-ingestion/tests/unit/test_source_helpers.py rename to metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py index b6ec6ebce240c8..b667af8bb41e98 100644 --- a/metadata-ingestion/tests/unit/test_source_helpers.py +++ b/metadata-ingestion/tests/unit/api/source_helpers/test_source_helpers.py @@ -16,6 +16,7 @@ from datahub.ingestion.api.source_helpers import ( auto_browse_path_v2, auto_empty_dataset_usage_statistics, + auto_lowercase_urns, auto_status_aspect, auto_workunit, ) @@ -275,6 +276,75 @@ def test_auto_browse_path_v2_legacy_browse_path(telemetry_ping_mock): assert paths["platform,dataset-2,PROD)"] == _make_browse_path_entries(["something"]) +def test_auto_lowercase_aspects(): + mcws = auto_workunit( + [ + MetadataChangeProposalWrapper( + entityUrn=make_dataset_urn( + "bigquery", "myProject.mySchema.myTable", "PROD" + ), + aspect=models.DatasetKeyClass( + "urn:li:dataPlatform:bigquery", "myProject.mySchema.myTable", "PROD" + ), + ), + MetadataChangeProposalWrapper( + entityUrn="urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a", + aspect=models.ContainerPropertiesClass( + name="test", + ), + ), + models.MetadataChangeEventClass( + proposedSnapshot=models.DatasetSnapshotClass( + urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-Public-Data.Covid19_Aha.staffing,PROD)", + aspects=[ + models.DatasetPropertiesClass( + customProperties={ + "key": "value", + }, + ), + ], + ), + ), + ] + ) + + expected = [ + *list( + auto_workunit( + [ + MetadataChangeProposalWrapper( + entityUrn="urn:li:dataset:(urn:li:dataPlatform:bigquery,myproject.myschema.mytable,PROD)", + aspect=models.DatasetKeyClass( + "urn:li:dataPlatform:bigquery", + "myProject.mySchema.myTable", + "PROD", + ), + ), + MetadataChangeProposalWrapper( + entityUrn="urn:li:container:008e111aa1d250dd52e0fd5d4b307b1a", + aspect=models.ContainerPropertiesClass( + name="test", + ), + ), + models.MetadataChangeEventClass( + proposedSnapshot=models.DatasetSnapshotClass( + urn="urn:li:dataset:(urn:li:dataPlatform:bigquery,bigquery-public-data.covid19_aha.staffing,PROD)", + aspects=[ + models.DatasetPropertiesClass( + customProperties={ + "key": "value", + }, + ), + ], + ), + ), + ] + ) + ), + ] + assert list(auto_lowercase_urns(mcws)) == expected + + @patch("datahub.ingestion.api.source_helpers.telemetry.telemetry_instance.ping") def test_auto_browse_path_v2_container_over_legacy_browse_path(telemetry_ping_mock): structure = {"a": {"b": ["c"]}} diff --git a/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py b/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py index cbd5be9e7d832b..4a69deb572fbd7 100644 --- a/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py +++ b/metadata-ingestion/tests/unit/data_lake/test_schema_inference.py @@ -1,14 +1,14 @@ import tempfile from typing import List, Type -import avro.schema import pandas as pd import ujson from avro import schema as avro_schema from avro.datafile import DataFileWriter from avro.io import DatumWriter -from datahub.ingestion.source.schema_inference import avro, csv_tsv, json, parquet +from datahub.ingestion.source.schema_inference import csv_tsv, json, parquet +from datahub.ingestion.source.schema_inference.avro import AvroInferrer from datahub.metadata.com.linkedin.pegasus2avro.schema import ( BooleanTypeClass, NumberTypeClass, @@ -123,7 +123,7 @@ def test_infer_schema_avro(): file.seek(0) - fields = avro.AvroInferrer().infer_schema(file) + fields = AvroInferrer().infer_schema(file) fields.sort(key=lambda 
x: x.fieldPath) assert_field_paths_match(fields, expected_field_paths_avro) diff --git a/metadata-ingestion/tests/unit/serde/test_serde.py b/metadata-ingestion/tests/unit/serde/test_serde.py index d116f1f5473faf..d2d6a0bdda5b9b 100644 --- a/metadata-ingestion/tests/unit/serde/test_serde.py +++ b/metadata-ingestion/tests/unit/serde/test_serde.py @@ -238,7 +238,7 @@ def test_missing_optional_simple() -> None: "criteria": [ { "condition": "EQUALS", - "field": "RESOURCE_TYPE", + "field": "TYPE", "values": ["notebook", "dataset", "dashboard"], } ] @@ -252,7 +252,7 @@ def test_missing_optional_simple() -> None: "criteria": [ { "condition": "EQUALS", - "field": "RESOURCE_TYPE", + "field": "TYPE", "values": ["notebook", "dataset", "dashboard"], } ] @@ -267,13 +267,13 @@ def test_missing_optional_simple() -> None: def test_missing_optional_in_union() -> None: # This one doesn't contain any optional fields and should work fine. revised_json = json.loads( - '{"lastUpdatedTimestamp":1662356745807,"actors":{"groups":[],"resourceOwners":false,"allUsers":true,"allGroups":false,"users":[]},"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"displayName":"customtest","resources":{"filter":{"criteria":[{"field":"RESOURCE_TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]},"allResources":false},"description":"","state":"ACTIVE","type":"METADATA"}' + '{"lastUpdatedTimestamp":1662356745807,"actors":{"groups":[],"resourceOwners":false,"allUsers":true,"allGroups":false,"users":[]},"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"displayName":"customtest","resources":{"filter":{"criteria":[{"field":"TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]},"allResources":false},"description":"","state":"ACTIVE","type":"METADATA"}' ) revised = models.DataHubPolicyInfoClass.from_obj(revised_json) # This one is missing the optional filters.allResources field. 
original_json = json.loads( - '{"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"actors":{"resourceOwners":false,"groups":[],"allGroups":false,"allUsers":true,"users":[]},"lastUpdatedTimestamp":1662356745807,"displayName":"customtest","description":"","resources":{"filter":{"criteria":[{"field":"RESOURCE_TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]}},"state":"ACTIVE","type":"METADATA"}' + '{"privileges":["EDIT_ENTITY_ASSERTIONS","EDIT_DATASET_COL_GLOSSARY_TERMS","EDIT_DATASET_COL_TAGS","EDIT_DATASET_COL_DESCRIPTION"],"actors":{"resourceOwners":false,"groups":[],"allGroups":false,"allUsers":true,"users":[]},"lastUpdatedTimestamp":1662356745807,"displayName":"customtest","description":"","resources":{"filter":{"criteria":[{"field":"TYPE","condition":"EQUALS","values":["notebook","dataset","dashboard"]}]}},"state":"ACTIVE","type":"METADATA"}' ) original = models.DataHubPolicyInfoClass.from_obj(original_json) diff --git a/metadata-ingestion/tests/unit/serde/test_urn_iterator.py b/metadata-ingestion/tests/unit/serde/test_urn_iterator.py index 9657ac45068da9..135580dcdff13e 100644 --- a/metadata-ingestion/tests/unit/serde/test_urn_iterator.py +++ b/metadata-ingestion/tests/unit/serde/test_urn_iterator.py @@ -1,4 +1,5 @@ import datahub.emitter.mce_builder as builder +from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( DatasetLineageTypeClass, FineGrainedLineage, @@ -10,11 +11,11 @@ from datahub.utilities.urns.urn_iter import list_urns_with_path, lowercase_dataset_urns -def _datasetUrn(tbl): +def _datasetUrn(tbl: str) -> str: return builder.make_dataset_urn("bigquery", tbl, "PROD") -def _fldUrn(tbl, fld): +def _fldUrn(tbl: str, fld: str) -> str: return builder.make_schema_field_urn(_datasetUrn(tbl), fld) @@ -114,8 +115,10 @@ def test_upstream_lineage_urn_iterator(): ] -def _make_test_lineage_obj(upstream: str, downstream: str) -> UpstreamLineage: - return UpstreamLineage( +def _make_test_lineage_obj( + table: str, upstream: str, downstream: str +) -> MetadataChangeProposalWrapper: + lineage = UpstreamLineage( upstreams=[ Upstream( dataset=_datasetUrn(upstream), @@ -132,11 +135,17 @@ def _make_test_lineage_obj(upstream: str, downstream: str) -> UpstreamLineage: ], ) + return MetadataChangeProposalWrapper(entityUrn=_datasetUrn(table), aspect=lineage) + def test_dataset_urn_lowercase_transformer(): - original = _make_test_lineage_obj("upstreamTable", "downstreamTable") + original = _make_test_lineage_obj( + "mainTableName", "upstreamTable", "downstreamTable" + ) - expected = _make_test_lineage_obj("upstreamtable", "downstreamtable") + expected = _make_test_lineage_obj( + "maintablename", "upstreamtable", "downstreamtable" + ) assert original != expected # sanity check diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json index e50d944ce72e36..d610b0a83f2290 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_create_view_with_cte.json @@ -12,7 +12,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj-2.dataset.my_view,PROD)", - "column": "col5" + "column": "col5", + "column_type": { + "type": { + 
"com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { @@ -24,7 +30,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj-2.dataset.my_view,PROD)", - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { @@ -36,7 +48,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj-2.dataset.my_view,PROD)", - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { @@ -48,7 +66,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-proj-2.dataset.my_view,PROD)", - "column": "col3" + "column": "col3", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json index 78591286feb505..2d3d188d28316d 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_from_sharded_table_wildcard.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json index 0e93d31fbb6a66..41ae0885941b00 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_nested_subqueries.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json index 78591286feb505..2d3d188d28316d 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_sharded_table_normalization.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { 
@@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json index 17a801a63e3ffc..26f8f8f59a3ff6 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_star_with_replace.json @@ -10,7 +10,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.my-dataset.test_table,PROD)", - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { @@ -22,7 +28,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.my-dataset.test_table,PROD)", - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { @@ -34,7 +46,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-project.my-dataset.test_table,PROD)", - "column": "something" + "column": "something", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json index fd8a586ac74ac0..83365c09f69c20 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_bigquery_view_from_union.json @@ -11,7 +11,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_view,PROD)", - "column": "col1" + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { @@ -27,7 +33,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_view,PROD)", - "column": "col2" + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "STRING" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json new file mode 100644 index 00000000000000..cf31b71cb50f6b --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_table_ddl.json @@ -0,0 +1,61 @@ +{ + "query_type": "CREATE", + "in_tables": [], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)", + "column": "id", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INTEGER" + }, + "upstreams": [] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)", + "column": "month", + "column_type": { + "type": { + 
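
The `column_type` objects threaded through all of these goldens are serialized `SchemaFieldDataTypeClass` values, with the union member keyed by its fully-qualified Avro record name. A small sketch of how that shape arises (assuming the standard `to_obj()` serialization of the generated schema classes):

```python
# Sketch: the "column_type" values in these goldens are serialized
# SchemaFieldDataTypeClass instances; the nested key is the union member's
# fully-qualified Avro record name.
from datahub.metadata.schema_classes import SchemaFieldDataTypeClass, StringTypeClass

dt = SchemaFieldDataTypeClass(type=StringTypeClass())
print(dt.to_obj())
# expected shape: {'type': {'com.linkedin.pegasus2avro.schema.StringType': {}}}
```
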
"com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" + }, + "upstreams": [] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)", + "column": "total_cost", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "REAL" + }, + "upstreams": [] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:sqlite,costs,PROD)", + "column": "area", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "REAL" + }, + "upstreams": [] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json index 1ca56840531e46..8a6b60d0f1bde8 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_create_view_as_select.json @@ -10,7 +10,9 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:oracle,vsal,PROD)", - "column": "Department" + "column": "Department", + "column_type": null, + "native_column_type": null }, "upstreams": [ { @@ -22,14 +24,22 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:oracle,vsal,PROD)", - "column": "Employees" + "column": "Employees", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "NUMBER" }, "upstreams": [] }, { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:oracle,vsal,PROD)", - "column": "Salary" + "column": "Salary", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json index e241bdd08e243c..eecb2265eaec55 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_expand_select_star_basic.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "total_agg" + "column": "total_agg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "orderkey" + "column": "orderkey", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -32,7 +44,13 @@ { "downstream": { "table": null, - "column": "custkey" + "column": "custkey", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -44,7 +62,13 @@ { "downstream": { "table": null, - "column": "orderstatus" + "column": "orderstatus", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -56,7 +80,13 @@ { "downstream": { "table": null, - "column": "totalprice" + "column": "totalprice", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { @@ -68,7 +98,13 @@ { "downstream": { "table": null, - "column": "orderdate" + "column": "orderdate", + 
"column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "native_column_type": "DATE" }, "upstreams": [ { @@ -80,7 +116,13 @@ { "downstream": { "table": null, - "column": "orderpriority" + "column": "orderpriority", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -92,7 +134,13 @@ { "downstream": { "table": null, - "column": "clerk" + "column": "clerk", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { @@ -104,7 +152,13 @@ { "downstream": { "table": null, - "column": "shippriority" + "column": "shippriority", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -116,7 +170,13 @@ { "downstream": { "table": null, - "column": "comment" + "column": "comment", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "TEXT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_as_select.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_as_select.json index d7264fd2db6b28..326db47e7ab333 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_as_select.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_insert_as_select.json @@ -18,21 +18,27 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "i_item_desc" + "column": "i_item_desc", + "column_type": null, + "native_column_type": null }, "upstreams": [] }, { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "w_warehouse_name" + "column": "w_warehouse_name", + "column_type": null, + "native_column_type": null }, "upstreams": [] }, { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "d_week_seq" + "column": "d_week_seq", + "column_type": null, + "native_column_type": null }, "upstreams": [ { @@ -44,7 +50,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "no_promo" + "column": "no_promo", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [ { @@ -56,7 +68,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "promo" + "column": "promo", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [ { @@ -68,7 +86,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:hive,query72,PROD)", - "column": "total_cnt" + "column": "total_cnt", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_postgres_select_subquery.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_postgres_select_subquery.json new file mode 100644 index 00000000000000..0c40ce120c9342 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_postgres_select_subquery.json @@ -0,0 +1,64 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.table1,PROD)", + 
"urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.table2,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "a", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.table1,PROD)", + "column": "a" + } + ] + }, + { + "downstream": { + "table": null, + "column": "b", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.table1,PROD)", + "column": "b" + } + ] + }, + { + "downstream": { + "table": null, + "column": "c", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.ArrayType": {} + } + }, + "native_column_type": "INT[]" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.table2,PROD)", + "column": "c" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json index 10f5ee20b0c1f1..b5fd5eebeb1b19 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json @@ -9,21 +9,27 @@ { "downstream": { "table": null, - "column": "a" + "column": "a", + "column_type": null, + "native_column_type": null }, "upstreams": [] }, { "downstream": { "table": null, - "column": "b" + "column": "b", + "column_type": null, + "native_column_type": null }, "upstreams": [] }, { "downstream": { "table": null, - "column": "c" + "column": "c", + "column_type": null, + "native_column_type": null }, "upstreams": [] } diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_count.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_count.json index 9f6eeae46c2940..a67c944822138f 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_count.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_count.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "COUNT(`fact_complaint_snapshot`.`etl_data_dt_id`)" + "column": "COUNT(`fact_complaint_snapshot`.`etl_data_dt_id`)", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json index 109de961804227..2424fcda347524 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_struct_subfields.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "post_id" + "column": "post_id", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "NUMERIC" }, "upstreams": [ { @@ -20,7 +26,9 @@ { "downstream": { "table": null, - "column": "id" + "column": "id", + "column_type": null, + "native_column_type": null }, "upstreams": [ { @@ -32,7 +40,9 @@ { "downstream": { 
"table": null, - "column": "min_metric" + "column": "min_metric", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json index 2340b2e95b0d0b..5d1d421f49a2aa 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_from_union.json @@ -1,5 +1,5 @@ { - "query_type": "UNKNOWN", + "query_type": "SELECT", "in_tables": [ "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf10.orders,PROD)", "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf100.orders,PROD)" @@ -9,14 +9,26 @@ { "downstream": { "table": null, - "column": "label" + "column": "label", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" }, "upstreams": [] }, { "downstream": { "table": null, - "column": "total_agg" + "column": "total_agg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_max.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_max.json index 326c07d332c268..6ea88f45847ce7 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_max.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_max.json @@ -8,7 +8,9 @@ { "downstream": { "table": null, - "column": "max_col" + "column": "max_col", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_ctes.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_ctes.json index 3e02314d6e8c39..67e9fd2d21a0e4 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_ctes.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_ctes.json @@ -9,7 +9,9 @@ { "downstream": { "table": null, - "column": "COL1" + "column": "COL1", + "column_type": null, + "native_column_type": null }, "upstreams": [ { @@ -21,7 +23,9 @@ { "downstream": { "table": null, - "column": "COL3" + "column": "COL3", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json index c12ad23b2f03b0..8dd2633eff6128 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_with_full_col_name.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "post_id" + "column": "post_id", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "NUMERIC" }, "upstreams": [ { @@ -20,7 +26,9 @@ { "downstream": { "table": null, - "column": "id" + "column": "id", + "column_type": null, + "native_column_type": null }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json index 64cd80e9a2d697..a876824127ec11 100644 --- 
a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_case_statement.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "total_price_category" + "column": "total_price_category", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "total_price_success" + "column": "total_price_success", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_cast.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_cast.json new file mode 100644 index 00000000000000..7545e2b3269dc0 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_cast.json @@ -0,0 +1,63 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "orderkey", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL(20, 0)" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "o_orderkey" + } + ] + }, + { + "downstream": { + "table": null, + "column": "total_cast_int", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "o_totalprice" + } + ] + }, + { + "downstream": { + "table": null, + "column": "total_cast_float", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL(16, 4)" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "o_totalprice" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json index 7b22a46757e392..84e6b053000f18 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_column_normalization.json @@ -8,7 +8,13 @@ { "downstream": { "table": null, - "column": "total_agg" + "column": "total_agg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -20,7 +26,13 @@ { "downstream": { "table": null, - "column": "total_avg" + "column": "total_avg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -32,7 +44,13 @@ { "downstream": { "table": null, - "column": "total_min" + "column": "total_min", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ 
{ @@ -44,7 +62,13 @@ { "downstream": { "table": null, - "column": "total_max" + "column": "total_max", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json index c912d99a3a8a32..39c94cf83c561b 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_ctas_column_normalization.json @@ -10,7 +10,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", - "column": "Total_Agg" + "column": "Total_Agg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -22,7 +28,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", - "column": "total_avg" + "column": "total_avg", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DOUBLE" }, "upstreams": [ { @@ -34,7 +46,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", - "column": "TOTAL_MIN" + "column": "TOTAL_MIN", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { @@ -46,7 +64,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders_normalized,PROD)", - "column": "total_max" + "column": "total_max", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "FLOAT" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json index 2af308ec606234..dbf5b1b9a44535 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_default_normalization.json @@ -11,7 +11,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "user_fk" + "column": "user_fk", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL(38, 0)" }, "upstreams": [ { @@ -23,7 +29,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "email" + "column": "email", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR(16777216)" }, "upstreams": [ { @@ -35,7 +47,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "last_purchase_date" + "column": "last_purchase_date", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.DateType": {} + } + }, + "native_column_type": "DATE" 
}, "upstreams": [ { @@ -47,7 +65,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "lifetime_purchase_amount" + "column": "lifetime_purchase_amount", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { @@ -59,7 +83,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "lifetime_purchase_count" + "column": "lifetime_purchase_count", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "BIGINT" }, "upstreams": [ { @@ -71,7 +101,13 @@ { "downstream": { "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,long_tail_companions.analytics.active_customer_ltv,PROD)", - "column": "average_purchase_amount" + "column": "average_purchase_amount", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" }, "upstreams": [ { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json new file mode 100644 index 00000000000000..d51001f9697992 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_from_table.json @@ -0,0 +1,57 @@ +{ + "query_type": "UPDATE", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)" + ], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "column": "col1", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "column": "col1" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "column": "col2" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)", + "column": "col2", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.StringType": {} + } + }, + "native_column_type": "VARCHAR" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)", + "column": "col1" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)", + "column": "col2" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json new file mode 100644 index 00000000000000..f421b28530c64d --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_hardcoded.json @@ -0,0 +1,37 @@ +{ + "query_type": "UPDATE", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "out_tables": [ + 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "orderkey", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "totalprice", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "INT" + }, + "upstreams": [] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_self.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_self.json new file mode 100644 index 00000000000000..c8cc32164a3eb4 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_snowflake_update_self.json @@ -0,0 +1,29 @@ +{ + "query_type": "UPDATE", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "orderkey", + "column_type": { + "type": { + "com.linkedin.pegasus2avro.schema.NumberType": {} + } + }, + "native_column_type": "DECIMAL" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)", + "column": "orderkey" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json new file mode 100644 index 00000000000000..ee80285d87f60b --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_default_normalization.json @@ -0,0 +1,40 @@ +{ + "query_type": "CREATE", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_diagnoses,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_features,PROD)" + ], + "out_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)" + ], + "column_lineage": [ + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)", + "column": "PatientId", + "column_type": null, + "native_column_type": "INTEGER()" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_diagnoses,PROD)", + "column": "PatientId" + } + ] + }, + { + "downstream": { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)", + "column": "BMI", + "column_type": null, + "native_column_type": "FLOAT()" + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_features,PROD)", + "column": "BMI" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json 
b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json new file mode 100644 index 00000000000000..4b21a2512ccd11 --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_teradata_strange_operators.json @@ -0,0 +1,46 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table2,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "col1", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table1,PROD)", + "column": "col1" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table2,PROD)", + "column": "col1" + } + ] + }, + { + "downstream": { + "table": null, + "column": "col2", + "column_type": null, + "native_column_type": null + }, + "upstreams": [ + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table1,PROD)", + "column": "col2" + }, + { + "table": "urn:li:dataset:(urn:li:dataPlatform:teradata,dbc.table2,PROD)", + "column": "col2" + } + ] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 483c1ac4cc7f93..c420f2b8438ce0 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -3,6 +3,7 @@ import pytest from datahub.testing.check_sql_parser_result import assert_sql_result +from datahub.utilities.sqlglot_lineage import _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT RESOURCE_DIR = pathlib.Path(__file__).parent / "goldens" @@ -274,6 +275,21 @@ def test_expand_select_star_basic(): ) +def test_create_table_ddl(): + assert_sql_result( + """ +CREATE TABLE IF NOT EXISTS costs ( + id INTEGER PRIMARY KEY, + month TEXT NOT NULL, + total_cost REAL NOT NULL, + area REAL NOT NULL +) +""", + dialect="sqlite", + expected_file=RESOURCE_DIR / "test_create_table_ddl.json", + ) + + def test_snowflake_column_normalization(): # Technically speaking this is incorrect since the column names are different and both quoted. 
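A minimal sketch of the mechanism these golden tests exercise, assuming only the open-source sqlglot package; DataHub's own sqlglot_lineage additionally splits the references into in_tables/out_tables and resolves column-level lineage against the supplied schemas:

import sqlglot
from sqlglot import exp

# Parse a CTAS and collect every table reference it touches.
statement = sqlglot.parse_one(
    "CREATE TABLE db.out_tbl AS SELECT a, b FROM db.in_tbl", read="snowflake"
)
tables = sorted(t.sql(dialect="snowflake") for t in statement.find_all(exp.Table))
print(tables)  # ['db.in_tbl', 'db.out_tbl']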
@@ -593,4 +609,291 @@ def test_snowflake_default_normalization(): ) +def test_snowflake_column_cast(): + assert_sql_result( + """ +SELECT + o.o_orderkey::NUMBER(20,0) as orderkey, + CAST(o.o_totalprice AS INT) as total_cast_int, + CAST(o.o_totalprice AS NUMBER(16,4)) as total_cast_float +FROM snowflake_sample_data.tpch_sf1.orders o +LIMIT 10 +""", + dialect="snowflake", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)": { + "orderkey": "NUMBER(38,0)", + "totalprice": "NUMBER(12,2)", + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_column_cast.json", + ) + + # TODO: Add a test for setting platform_instance or env + + +def test_teradata_default_normalization(): + assert_sql_result( + """ +create table demo_user.test_lineage2 as + ( + select + ppd.PatientId, + ppf.bmi + from + demo_user.pima_patient_features ppf + join demo_user.pima_patient_diagnoses ppd on + ppd.PatientId = ppf.PatientId + ) with data; +""", + dialect="teradata", + default_schema="dbc", + platform_instance="myteradata", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_diagnoses,PROD)": { + "HasDiabetes": "INTEGER()", + "PatientId": "INTEGER()", + }, + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.pima_patient_features,PROD)": { + "Age": "INTEGER()", + "BMI": "FLOAT()", + "BloodP": "INTEGER()", + "DiPedFunc": "FLOAT()", + "NumTimesPrg": "INTEGER()", + "PatientId": "INTEGER()", + "PlGlcConc": "INTEGER()", + "SkinThick": "INTEGER()", + "TwoHourSerIns": "INTEGER()", + }, + "urn:li:dataset:(urn:li:dataPlatform:teradata,myteradata.demo_user.test_lineage2,PROD)": { + "BMI": "FLOAT()", + "PatientId": "INTEGER()", + }, + }, + expected_file=RESOURCE_DIR / "test_teradata_default_normalization.json", + ) + + +def test_teradata_strange_operators(): + # This is a test for the following operators: + # - `SEL` (select) + # - `EQ` (equals) + # - `MINUS` (except) + assert_sql_result( + """ +sel col1, col2 from dbc.table1 +where col1 eq 'value1' +minus +select col1, col2 from dbc.table2 +""", + dialect="teradata", + default_schema="dbc", + expected_file=RESOURCE_DIR / "test_teradata_strange_operators.json", + ) + + +@pytest.mark.skip("sqlglot doesn't support this cast syntax yet") +def test_teradata_cast_syntax(): + assert_sql_result( + """ +SELECT my_table.date_col MONTH(4) AS month_col +FROM my_table +""", + dialect="teradata", + default_schema="dbc", + expected_file=RESOURCE_DIR / "test_teradata_cast_syntax.json", + ) + + +def test_snowflake_update_hardcoded(): + assert_sql_result( + """ +UPDATE snowflake_sample_data.tpch_sf1.orders +SET orderkey = 1, totalprice = 2 +WHERE orderkey = 3 +""", + dialect="snowflake", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)": { + "orderkey": "NUMBER(38,0)", + "totalprice": "NUMBER(12,2)", + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_update_hardcoded.json", + ) + + +def test_update_from_select(): + assert _UPDATE_ARGS_NOT_SUPPORTED_BY_SELECT == {"returning", "this"} + + +def test_snowflake_update_from_table(): + # Can create these tables with the following SQL: + """ + -- Create or replace my_table + CREATE OR REPLACE TABLE my_table ( + id INT IDENTITY PRIMARY KEY, + col1 VARCHAR(50), + col2 VARCHAR(50) + ); + + -- Create or replace table1 + CREATE OR REPLACE TABLE table1 ( + id INT IDENTITY PRIMARY KEY, + col1 VARCHAR(50), + col2 VARCHAR(50) + ); + + -- Create or replace table2 + CREATE OR REPLACE TABLE 
table2 ( + id INT IDENTITY PRIMARY KEY, + col2 VARCHAR(50) + ); + + -- Insert data into my_table + INSERT INTO my_table (col1, col2) + VALUES ('foo', 'bar'), + ('baz', 'qux'); + + -- Insert data into table1 + INSERT INTO table1 (col1, col2) + VALUES ('foo', 'bar'), + ('baz', 'qux'); + + -- Insert data into table2 + INSERT INTO table2 (col2) + VALUES ('bar'), + ('qux'); + """ + + assert_sql_result( + """ +UPDATE my_table +SET + col1 = t1.col1 || t1.col2, + col2 = t1.col1 || t2.col2 +FROM table1 t1 +JOIN table2 t2 ON t1.id = t2.id +WHERE my_table.id = t1.id; +""", + dialect="snowflake", + default_db="my_db", + default_schema="my_schema", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.my_table,PROD)": { + "id": "NUMBER(38,0)", + "col1": "VARCHAR(16777216)", + "col2": "VARCHAR(16777216)", + }, + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table1,PROD)": { + "id": "NUMBER(38,0)", + "col1": "VARCHAR(16777216)", + "col2": "VARCHAR(16777216)", + }, + "urn:li:dataset:(urn:li:dataPlatform:snowflake,my_db.my_schema.table2,PROD)": { + "id": "NUMBER(38,0)", + "col1": "VARCHAR(16777216)", + "col2": "VARCHAR(16777216)", + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_update_from_table.json", + ) + + +def test_snowflake_update_self(): + assert_sql_result( + """ +UPDATE snowflake_sample_data.tpch_sf1.orders +SET orderkey = orderkey + 1 +""", + dialect="snowflake", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:snowflake,snowflake_sample_data.tpch_sf1.orders,PROD)": { + "orderkey": "NUMBER(38,0)", + "totalprice": "NUMBER(12,2)", + }, + }, + expected_file=RESOURCE_DIR / "test_snowflake_update_self.json", + ) + + +def test_postgres_select_subquery(): + assert_sql_result( + """ +SELECT + a, + b, + (SELECT c FROM table2 WHERE table2.id = table1.id) as c +FROM table1 +""", + dialect="postgres", + default_db="my_db", + default_schema="my_schema", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.table1,PROD)": { + "id": "INTEGER", + "a": "INTEGER", + "b": "INTEGER", + }, + "urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.table2,PROD)": { + "id": "INTEGER", + "c": "INTEGER", + }, + }, + expected_file=RESOURCE_DIR / "test_postgres_select_subquery.json", + ) + + +@pytest.mark.skip(reason="We can't parse column-list syntax with sub-selects yet") +def test_postgres_update_subselect(): + assert_sql_result( + """ +UPDATE accounts SET sales_person_name = + (SELECT name FROM employees + WHERE employees.id = accounts.sales_person_id) +""", + dialect="postgres", + default_db="my_db", + default_schema="my_schema", + schemas={ + "urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.accounts,PROD)": { + "id": "INTEGER", + "sales_person_id": "INTEGER", + "sales_person_name": "VARCHAR(16777216)", + }, + "urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.employees,PROD)": { + "id": "INTEGER", + "name": "VARCHAR(16777216)", + }, + }, + expected_file=RESOURCE_DIR / "test_postgres_update_subselect.json", + ) + + +@pytest.mark.skip(reason="We can't parse column-list syntax with sub-selects yet") +def test_postgres_complex_update(): + # Example query from the postgres docs: + # https://www.postgresql.org/docs/current/sql-update.html + assert_sql_result( + """ +UPDATE accounts SET (contact_first_name, contact_last_name) = + (SELECT first_name, last_name FROM employees + WHERE employees.id = accounts.sales_person); +""", + dialect="postgres", + schemas={ + 
"urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.accounts,PROD)": { + "id": "INTEGER", + "contact_first_name": "VARCHAR(16777216)", + "contact_last_name": "VARCHAR(16777216)", + "sales_person": "INTEGER", + }, + "urn:li:dataset:(urn:li:dataPlatform:postgres,my_db.my_schema.employees,PROD)": { + "id": "INTEGER", + "first_name": "VARCHAR(16777216)", + "last_name": "VARCHAR(16777216)", + }, + }, + expected_file=RESOURCE_DIR / "test_postgres_complex_update.json", + ) diff --git a/metadata-ingestion/tests/unit/test_athena_source.py b/metadata-ingestion/tests/unit/test_athena_source.py index 2558f6a46715e6..23dd7dd5a6e45f 100644 --- a/metadata-ingestion/tests/unit/test_athena_source.py +++ b/metadata-ingestion/tests/unit/test_athena_source.py @@ -3,14 +3,17 @@ import pytest from freezegun import freeze_time +from sqlalchemy import types +from sqlalchemy_bigquery import STRUCT from datahub.ingestion.api.common import PipelineContext -from src.datahub.ingestion.source.aws.s3_util import make_s3_urn +from datahub.ingestion.source.aws.s3_util import make_s3_urn +from datahub.ingestion.source.sql.athena import CustomAthenaRestDialect +from datahub.utilities.sqlalchemy_type_converter import MapType FROZEN_TIME = "2020-04-14 07:00:00" -@pytest.mark.integration def test_athena_config_query_location_old_plus_new_value_not_allowed(): from datahub.ingestion.source.sql.athena import AthenaConfig @@ -25,7 +28,6 @@ def test_athena_config_query_location_old_plus_new_value_not_allowed(): ) -@pytest.mark.integration def test_athena_config_staging_dir_is_set_as_query_result(): from datahub.ingestion.source.sql.athena import AthenaConfig @@ -48,7 +50,6 @@ def test_athena_config_staging_dir_is_set_as_query_result(): assert config.json() == expected_config.json() -@pytest.mark.integration def test_athena_uri(): from datahub.ingestion.source.sql.athena import AthenaConfig @@ -59,9 +60,12 @@ def test_athena_uri(): "work_group": "test-workgroup", } ) - assert ( - config.get_sql_alchemy_url() - == "awsathena+rest://@athena.us-west-1.amazonaws.com:443/?s3_staging_dir=s3%3A%2F%2Fquery-result-location%2F&work_group=test-workgroup&catalog_name=awsdatacatalog&duration_seconds=3600" + assert config.get_sql_alchemy_url() == ( + "awsathena+rest://@athena.us-west-1.amazonaws.com:443" + "?catalog_name=awsdatacatalog" + "&duration_seconds=3600" + "&s3_staging_dir=s3%3A%2F%2Fquery-result-location%2F" + "&work_group=test-workgroup" ) @@ -104,7 +108,7 @@ def test_athena_get_table_properties(): mock_cursor = mock.MagicMock() mock_inspector = mock.MagicMock() mock_inspector.engine.raw_connection().cursor.return_value = mock_cursor - mock_cursor._get_table_metadata.return_value = AthenaTableMetadata( + mock_cursor.get_table_metadata.return_value = AthenaTableMetadata( response=table_metadata ) @@ -126,3 +130,81 @@ def test_athena_get_table_properties(): } assert location == make_s3_urn("s3://testLocation", "PROD") + + +def test_get_column_type_simple_types(): + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="int"), types.Integer + ) + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="string"), types.String + ) + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="boolean"), types.BOOLEAN + ) + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="long"), types.BIGINT + ) + assert isinstance( + CustomAthenaRestDialect()._get_column_type(type_="double"), types.FLOAT + ) + + +def test_get_column_type_array(): + result = 
CustomAthenaRestDialect()._get_column_type(type_="array<string>") + + assert isinstance(result, types.ARRAY) + assert isinstance(result.item_type, types.String) + + +def test_get_column_type_map(): + result = CustomAthenaRestDialect()._get_column_type(type_="map<string,int>") + + assert isinstance(result, MapType) + assert isinstance(result.types[0], types.String) + assert isinstance(result.types[1], types.Integer) + + +def test_column_type_struct(): + + result = CustomAthenaRestDialect()._get_column_type(type_="struct<test:string>") + + assert isinstance(result, STRUCT) + assert isinstance(result._STRUCT_fields[0], tuple) + assert result._STRUCT_fields[0][0] == "test" + assert isinstance(result._STRUCT_fields[0][1], types.String) + + +def test_column_type_complex_combination(): + + result = CustomAthenaRestDialect()._get_column_type( + type_="struct<id:string, name:string, choices:array<struct<id:string, label:string>>>" + ) + + assert isinstance(result, STRUCT) + + assert isinstance(result._STRUCT_fields[0], tuple) + assert result._STRUCT_fields[0][0] == "id" + assert isinstance(result._STRUCT_fields[0][1], types.String) + + assert isinstance(result._STRUCT_fields[1], tuple) + assert result._STRUCT_fields[1][0] == "name" + assert isinstance(result._STRUCT_fields[1][1], types.String) + + assert isinstance(result._STRUCT_fields[2], tuple) + assert result._STRUCT_fields[2][0] == "choices" + assert isinstance(result._STRUCT_fields[2][1], types.ARRAY) + + assert isinstance(result._STRUCT_fields[2][1].item_type, STRUCT) + + assert isinstance(result._STRUCT_fields[2][1].item_type._STRUCT_fields[0], tuple) + assert result._STRUCT_fields[2][1].item_type._STRUCT_fields[0][0] == "id" + assert isinstance( + result._STRUCT_fields[2][1].item_type._STRUCT_fields[0][1], types.String + ) + + assert isinstance(result._STRUCT_fields[2][1].item_type._STRUCT_fields[1], tuple) + assert result._STRUCT_fields[2][1].item_type._STRUCT_fields[1][0] == "label" + assert isinstance( + result._STRUCT_fields[2][1].item_type._STRUCT_fields[1][1], types.String + ) diff --git a/metadata-ingestion/tests/unit/test_bigquery_source.py b/metadata-ingestion/tests/unit/test_bigquery_source.py index 4fc6c31626ba82..4cfa5c48d23771 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_source.py +++ b/metadata-ingestion/tests/unit/test_bigquery_source.py @@ -3,13 +3,14 @@ import os from datetime import datetime, timedelta, timezone from types import SimpleNamespace -from typing import Any, Dict, Optional, cast +from typing import Any, Dict, List, Optional, cast from unittest.mock import MagicMock, Mock, patch import pytest from google.api_core.exceptions import GoogleAPICallError from google.cloud.bigquery.table import Row, TableListItem +from datahub.configuration.common import AllowDenyPattern from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.bigquery_v2.bigquery import BigqueryV2Source from datahub.ingestion.source.bigquery_v2.bigquery_audit import ( @@ -17,9 +18,13 @@ BigqueryTableIdentifier, BigQueryTableRef, ) -from datahub.ingestion.source.bigquery_v2.bigquery_config import BigQueryV2Config +from datahub.ingestion.source.bigquery_v2.bigquery_config import ( + BigQueryConnectionConfig, + BigQueryV2Config, +) from datahub.ingestion.source.bigquery_v2.bigquery_report import BigQueryV2Report from datahub.ingestion.source.bigquery_v2.bigquery_schema import ( + BigqueryDataset, BigqueryProject, BigQuerySchemaApi, BigqueryView, @@ -48,6 +53,59 @@ def test_bigquery_uri_on_behalf(): assert config.get_sql_alchemy_url() == "bigquery://test-project-on-behalf" +def test_bigquery_dataset_pattern(): +
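+    # In plain terms: with the default match_fully_qualified_names=True, a
+    # dataset-only pattern is rewritten so it can match fully qualified
+    # "project.dataset" names ("test-dataset" -> r".*\.test-dataset", and the
+    # anchored "^test-dataset-2$" -> "^.*\.test-dataset-2$"); patterns that
+    # already contain a dot pass through unchanged. The second parse below,
+    # with match_fully_qualified_names=False, shows the patterns staying as-is.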
config = BigQueryV2Config.parse_obj( + { + "dataset_pattern": { + "allow": [ + "test-dataset", + "test-project.test-dataset", + ".*test-dataset", + ], + "deny": [ + "^test-dataset-2$", + "project\\.second_dataset", + ], + }, + } + ) + assert config.dataset_pattern.allow == [ + r".*\.test-dataset", + r"test-project.test-dataset", + r".*test-dataset", + ] + assert config.dataset_pattern.deny == [ + r"^.*\.test-dataset-2$", + r"project\.second_dataset", + ] + + config = BigQueryV2Config.parse_obj( + { + "dataset_pattern": { + "allow": [ + "test-dataset", + "test-project.test-dataset", + ".*test-dataset", + ], + "deny": [ + "^test-dataset-2$", + "project\\.second_dataset", + ], + }, + "match_fully_qualified_names": False, + } + ) + assert config.dataset_pattern.allow == [ + r"test-dataset", + r"test-project.test-dataset", + r".*test-dataset", + ] + assert config.dataset_pattern.deny == [ + r"^test-dataset-2$", + r"project\.second_dataset", + ] + + def test_bigquery_uri_with_credential(): expected_credential_json = { "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs", @@ -760,11 +818,14 @@ def test_gen_view_dataset_workunits( ("project.dataset.table_20231215", "project.dataset.table", "20231215"), ("project.dataset.table_2023", "project.dataset.table_2023", None), # incorrectly handled special case where dataset itself is a sharded table if full name is specified - ("project.dataset.20231215", "project.dataset.20231215", None), + ("project.dataset.20231215", "project.dataset.20231215", "20231215"), + ("project1.dataset2.20231215", "project1.dataset2.20231215", "20231215"), # Cases with Just the table name as input ("table", "table", None), - ("table20231215", "table20231215", None), + ("table20231215", "table", "20231215"), ("table_20231215", "table", "20231215"), + ("table2_20231215", "table2", "20231215"), + ("table220231215", "table220231215", None), ("table_1624046611000_name", "table_1624046611000_name", None), ("table_1624046611000", "table_1624046611000", None), # Special case where dataset itself is a sharded table @@ -796,7 +857,6 @@ def test_get_table_and_shard_default( ("project.dataset.2023", "project.dataset.2023", None), # Cases with Just the table name as input ("table", "table", None), - ("table20231215", "table20231215", None), ("table_20231215", "table", "20231215"), ("table_2023", "table", "2023"), ("table_1624046611000_name", "table_1624046611000_name", None), @@ -837,7 +897,7 @@ def test_get_table_and_shard_custom_shard_pattern( "project.dataset.table_1624046611000_name", ), ("project.dataset.table_1624046611000", "project.dataset.table_1624046611000"), - ("project.dataset.table20231215", "project.dataset.table20231215"), + ("project.dataset.table20231215", "project.dataset.table"), ("project.dataset.table_*", "project.dataset.table"), ("project.dataset.table_2023*", "project.dataset.table"), ("project.dataset.table_202301*", "project.dataset.table"), @@ -854,3 +914,47 @@ def test_get_table_name(full_table_name: str, datahub_full_table_name: str) -> N BigqueryTableIdentifier.from_string_name(full_table_name).get_table_name() == datahub_full_table_name ) + + +def test_default_config_for_excluding_projects_and_datasets(): + config = BigQueryV2Config.parse_obj({}) + assert config.exclude_empty_projects is False + config = BigQueryV2Config.parse_obj({"exclude_empty_projects": True}) + assert config.exclude_empty_projects + + +@patch.object(BigQueryConnectionConfig, "get_bigquery_client", new=lambda self: None) +@patch.object(BigQuerySchemaApi, 
"get_datasets_for_project_id") +def test_excluding_empty_projects_from_ingestion( + get_datasets_for_project_id_mock, +): + project_id_with_datasets = "project-id-with-datasets" + project_id_without_datasets = "project-id-without-datasets" + + def get_datasets_for_project_id_side_effect( + project_id: str, + ) -> List[BigqueryDataset]: + return ( + [] + if project_id == project_id_without_datasets + else [BigqueryDataset("some-dataset")] + ) + + get_datasets_for_project_id_mock.side_effect = ( + get_datasets_for_project_id_side_effect + ) + + base_config = { + "project_ids": [project_id_with_datasets, project_id_without_datasets], + "schema_pattern": AllowDenyPattern(deny=[".*"]), + "include_usage_statistics": False, + "include_table_lineage": False, + } + + config = BigQueryV2Config.parse_obj(base_config) + source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test-1")) + assert len({wu.metadata.entityUrn for wu in source.get_workunits()}) == 2 # type: ignore + + config = BigQueryV2Config.parse_obj({**base_config, "exclude_empty_projects": True}) + source = BigqueryV2Source(config=config, ctx=PipelineContext(run_id="test-2")) + assert len({wu.metadata.entityUrn for wu in source.get_workunits()}) == 1 # type: ignore diff --git a/metadata-ingestion/tests/unit/test_bigquery_usage.py b/metadata-ingestion/tests/unit/test_bigquery_usage.py index e06c6fb3fe7e5b..1eb5d8b00e27c9 100644 --- a/metadata-ingestion/tests/unit/test_bigquery_usage.py +++ b/metadata-ingestion/tests/unit/test_bigquery_usage.py @@ -35,7 +35,7 @@ TimeWindowSizeClass, ) from datahub.testing.compare_metadata_json import diff_metadata_json -from tests.performance.bigquery import generate_events, ref_from_table +from tests.performance.bigquery.bigquery_events import generate_events, ref_from_table from tests.performance.data_generation import generate_data, generate_queries from tests.performance.data_model import Container, FieldAccess, Query, Table, View @@ -45,14 +45,15 @@ ACTOR_2, ACTOR_2_URN = "b@acryl.io", "urn:li:corpuser:b" DATABASE_1 = Container("database_1") DATABASE_2 = Container("database_2") -TABLE_1 = Table("table_1", DATABASE_1, ["id", "name", "age"]) -TABLE_2 = Table("table_2", DATABASE_1, ["id", "table_1_id", "value"]) +TABLE_1 = Table("table_1", DATABASE_1, ["id", "name", "age"], None) +TABLE_2 = Table("table_2", DATABASE_1, ["id", "table_1_id", "value"], None) VIEW_1 = View( name="view_1", container=DATABASE_1, columns=["id", "name", "total"], definition="VIEW DEFINITION 1", parents=[TABLE_1, TABLE_2], + column_mapping=None, ) ALL_TABLES = [TABLE_1, TABLE_2, VIEW_1] diff --git a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py index 4cf42da4395f94..44fd840f28d594 100644 --- a/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py +++ b/metadata-ingestion/tests/unit/test_bigqueryv2_usage_source.py @@ -144,10 +144,10 @@ def test_bigquery_table_sanitasitation(): assert new_table_ref.dataset == "dataset-4567" table_ref = BigQueryTableRef( - BigqueryTableIdentifier("project-1234", "dataset-4567", "foo_20222110") + BigqueryTableIdentifier("project-1234", "dataset-4567", "foo_20221210") ) new_table_identifier = table_ref.table_identifier - assert new_table_identifier.table == "foo_20222110" + assert new_table_identifier.table == "foo_20221210" assert new_table_identifier.is_sharded_table() assert new_table_identifier.get_table_display_name() == "foo" assert new_table_identifier.project_id == "project-1234" diff --git 
a/metadata-ingestion/tests/unit/test_clickhouse_source.py b/metadata-ingestion/tests/unit/test_clickhouse_source.py index de7e7d66f21290..1b2ffb70c8d190 100644 --- a/metadata-ingestion/tests/unit/test_clickhouse_source.py +++ b/metadata-ingestion/tests/unit/test_clickhouse_source.py @@ -26,9 +26,7 @@ def test_clickhouse_uri_native(): "scheme": "clickhouse+native", } ) - assert ( - config.get_sql_alchemy_url() == "clickhouse+native://user:password@host:1111/" - ) + assert config.get_sql_alchemy_url() == "clickhouse+native://user:password@host:1111" def test_clickhouse_uri_native_secure(): diff --git a/metadata-ingestion/tests/unit/test_mapping.py b/metadata-ingestion/tests/unit/test_mapping.py index d69dd4a8a96b0d..5c258f16535f88 100644 --- a/metadata-ingestion/tests/unit/test_mapping.py +++ b/metadata-ingestion/tests/unit/test_mapping.py @@ -4,6 +4,7 @@ from datahub.metadata.schema_classes import ( GlobalTagsClass, GlossaryTermsClass, + InstitutionalMemoryClass, OwnerClass, OwnershipClass, OwnershipSourceTypeClass, @@ -233,6 +234,46 @@ def test_operation_processor_advanced_matching_tags(): assert tag_aspect.tags[0].tag == "urn:li:tag:case_4567" +def test_operation_processor_institutional_memory(): + raw_props = { + "documentation_link": "https://test.com/documentation#ignore-this", + } + processor = OperationProcessor( + operation_defs={ + "documentation_link": { + "match": r"(?:https?)?\:\/\/\w*[^#]*", + "operation": "add_doc_link", + "config": {"link": "{{ $match }}", "description": "test"}, + }, + }, + ) + aspect_map = processor.process(raw_props) + assert "add_doc_link" in aspect_map + + doc_link_aspect: InstitutionalMemoryClass = aspect_map["add_doc_link"] + + assert doc_link_aspect.elements[0].url == "https://test.com/documentation" + assert doc_link_aspect.elements[0].description == "test" + + +def test_operation_processor_institutional_memory_no_description(): + raw_props = { + "documentation_link": "test.com/documentation#ignore-this", + } + processor = OperationProcessor( + operation_defs={ + "documentation_link": { + "match": r"(?:https?)?\:\/\/\w*[^#]*", + "operation": "add_doc_link", + "config": {"link": "{{ $match }}"}, + }, + }, + ) + # we require a description, so this should stay empty + aspect_map = processor.process(raw_props) + assert aspect_map == {} + + def test_operation_processor_matching_nested_props(): raw_props = { "gdpr": { diff --git a/metadata-ingestion/tests/unit/test_mcp_builder.py b/metadata-ingestion/tests/unit/test_mcp_builder.py index 23f2bddc2084e8..e304edb24789cd 100644 --- a/metadata-ingestion/tests/unit/test_mcp_builder.py +++ b/metadata-ingestion/tests/unit/test_mcp_builder.py @@ -1,5 +1,5 @@ import datahub.emitter.mcp_builder as builder -from datahub.emitter.mce_builder import datahub_guid +from datahub.metadata.schema_classes import StatusClass, TelemetryClientIdClass def test_guid_generator(): @@ -80,7 +80,15 @@ def test_guid_generators(): key = builder.SchemaKey( database="test", schema="Test", platform="mysql", instance="TestInstance" ) - guid_datahub = datahub_guid(key.dict(by_alias=True)) + guid_datahub = key.guid() guid = key.guid() assert guid == guid_datahub + + +def test_entity_supports_aspect(): + assert builder.entity_supports_aspect("dataset", StatusClass) + assert not builder.entity_supports_aspect("telemetry", StatusClass) + + assert not builder.entity_supports_aspect("dataset", TelemetryClientIdClass) + assert builder.entity_supports_aspect("telemetry", TelemetryClientIdClass) diff --git 
a/metadata-ingestion/tests/unit/test_pydantic_validators.py b/metadata-ingestion/tests/unit/test_pydantic_validators.py index 07d86043a35bf8..3e9ec6cbaf3579 100644 --- a/metadata-ingestion/tests/unit/test_pydantic_validators.py +++ b/metadata-ingestion/tests/unit/test_pydantic_validators.py @@ -4,7 +4,7 @@ from pydantic import ValidationError from datahub.configuration.common import ConfigModel -from datahub.configuration.pydantic_field_deprecation import pydantic_field_deprecated +from datahub.configuration.validate_field_deprecation import pydantic_field_deprecated from datahub.configuration.validate_field_removal import pydantic_removed_field from datahub.configuration.validate_field_rename import pydantic_renamed_field from datahub.utilities.global_warning_util import get_global_warnings diff --git a/metadata-ingestion/tests/unit/test_redshift_config.py b/metadata-ingestion/tests/unit/test_redshift_config.py new file mode 100644 index 00000000000000..8a165e7f5f3fe3 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_redshift_config.py @@ -0,0 +1,6 @@ +from datahub.ingestion.source.redshift.config import RedshiftConfig + + +def test_incremental_lineage_default_to_false(): + config = RedshiftConfig(host_port="localhost:5439", database="test") + assert config.incremental_lineage is False diff --git a/metadata-ingestion/tests/unit/test_redshift_lineage.py b/metadata-ingestion/tests/unit/test_redshift_lineage.py index c7d6ac18e044cb..db5af3a71efb99 100644 --- a/metadata-ingestion/tests/unit/test_redshift_lineage.py +++ b/metadata-ingestion/tests/unit/test_redshift_lineage.py @@ -1,6 +1,8 @@ +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.redshift.config import RedshiftConfig from datahub.ingestion.source.redshift.lineage import RedshiftLineageExtractor from datahub.ingestion.source.redshift.report import RedshiftReport +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, DownstreamColumnRef def test_get_sources_from_query(): @@ -10,14 +12,20 @@ def test_get_sources_from_query(): test_query = """ select * from my_schema.my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test.my_schema.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test.my_schema.my_table,PROD)" + ) def test_get_sources_from_query_with_only_table_name(): @@ -27,14 +35,20 @@ def test_get_sources_from_query_with_only_table_name(): test_query = """ select * from my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test.public.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test.public.my_table,PROD)" + ) def test_get_sources_from_query_with_database(): @@ -44,14 +58,20 @@ def test_get_sources_from_query_with_database(): test_query = """ select * 
from test.my_schema.my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test.my_schema.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test.my_schema.my_table,PROD)" + ) def test_get_sources_from_query_with_non_default_database(): @@ -61,14 +81,20 @@ def test_get_sources_from_query_with_non_default_database(): test_query = """ select * from test2.my_schema.my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test2.my_schema.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test2.my_schema.my_table,PROD)" + ) def test_get_sources_from_query_with_only_table(): @@ -78,27 +104,48 @@ def test_get_sources_from_query_with_only_table(): test_query = """ select * from my_table """ - lineage_extractor = RedshiftLineageExtractor(config, report) - lineage_datasets = lineage_extractor._get_sources_from_query( + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + lineage_datasets, _ = lineage_extractor._get_sources_from_query( db_name="test", query=test_query ) assert len(lineage_datasets) == 1 lineage = lineage_datasets[0] - assert lineage.path == "test.public.my_table" + + assert ( + lineage.urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,test.public.my_table,PROD)" + ) -def test_get_sources_from_query_with_four_part_table_should_throw_exception(): +def test_cll(): config = RedshiftConfig(host_port="localhost:5439", database="test") report = RedshiftReport() test_query = """ - select * from database.schema.my_table.test + select a,b,c from db.public.customer inner join db.public.order on db.public.customer.id = db.public.order.customer_id """ - lineage_extractor = RedshiftLineageExtractor(config, report) - try: - lineage_extractor._get_sources_from_query(db_name="test", query=test_query) - except ValueError: - pass - - assert f"{test_query} should have thrown a ValueError exception but it didn't" + lineage_extractor = RedshiftLineageExtractor( + config, report, PipelineContext(run_id="foo") + ) + _, cll = lineage_extractor._get_sources_from_query(db_name="db", query=test_query) + + assert cll == [ + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="a"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="b"), + upstreams=[], + logic=None, + ), + ColumnLineageInfo( + downstream=DownstreamColumnRef(table=None, column="c"), + upstreams=[], + logic=None, + ), + ] diff --git a/metadata-ingestion/tests/unit/test_snowflake_source.py b/metadata-ingestion/tests/unit/test_snowflake_source.py index 1c26ca2487e5ca..888a7c04415542 100644 --- a/metadata-ingestion/tests/unit/test_snowflake_source.py +++ b/metadata-ingestion/tests/unit/test_snowflake_source.py @@ 
-179,10 +179,12 @@ def test_snowflake_uri_default_authentication(): } ) - assert ( - config.get_sql_alchemy_url() - == "snowflake://user:password@acctname/?authenticator=SNOWFLAKE&warehouse=COMPUTE_WH&role" - "=sysadmin&application=acryl_datahub" + assert config.get_sql_alchemy_url() == ( + "snowflake://user:password@acctname" + "?application=acryl_datahub" + "&authenticator=SNOWFLAKE" + "&role=sysadmin" + "&warehouse=COMPUTE_WH" ) @@ -198,10 +200,12 @@ def test_snowflake_uri_external_browser_authentication(): } ) - assert ( - config.get_sql_alchemy_url() - == "snowflake://user@acctname/?authenticator=EXTERNALBROWSER&warehouse=COMPUTE_WH&role" - "=sysadmin&application=acryl_datahub" + assert config.get_sql_alchemy_url() == ( + "snowflake://user@acctname" + "?application=acryl_datahub" + "&authenticator=EXTERNALBROWSER" + "&role=sysadmin" + "&warehouse=COMPUTE_WH" ) @@ -219,10 +223,12 @@ def test_snowflake_uri_key_pair_authentication(): } ) - assert ( - config.get_sql_alchemy_url() - == "snowflake://user@acctname/?authenticator=SNOWFLAKE_JWT&warehouse=COMPUTE_WH&role" - "=sysadmin&application=acryl_datahub" + assert config.get_sql_alchemy_url() == ( + "snowflake://user@acctname" + "?application=acryl_datahub" + "&authenticator=SNOWFLAKE_JWT" + "&role=sysadmin" + "&warehouse=COMPUTE_WH" ) diff --git a/metadata-ingestion/tests/unit/test_sql_common.py b/metadata-ingestion/tests/unit/test_sql_common.py index 95af0e623e9911..808b38192411dc 100644 --- a/metadata-ingestion/tests/unit/test_sql_common.py +++ b/metadata-ingestion/tests/unit/test_sql_common.py @@ -4,12 +4,11 @@ import pytest from sqlalchemy.engine.reflection import Inspector -from datahub.ingestion.source.sql.sql_common import ( - PipelineContext, - SQLAlchemySource, +from datahub.ingestion.source.sql.sql_common import PipelineContext, SQLAlchemySource +from datahub.ingestion.source.sql.sql_config import SQLCommonConfig +from datahub.ingestion.source.sql.sqlalchemy_uri_mapper import ( get_platform_from_sqlalchemy_uri, ) -from datahub.ingestion.source.sql.sql_config import SQLCommonConfig class _TestSQLAlchemyConfig(SQLCommonConfig): diff --git a/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py b/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py new file mode 100644 index 00000000000000..0384e1f9188812 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_ratelimiter.py @@ -0,0 +1,20 @@ +from collections import defaultdict +from datetime import datetime +from typing import Dict + +from datahub.utilities.ratelimiter import RateLimiter + + +def test_rate_is_limited(): + MAX_CALLS_PER_SEC = 5 + TOTAL_CALLS = 18 + actual_calls: Dict[float, int] = defaultdict(lambda: 0) + + ratelimiter = RateLimiter(max_calls=MAX_CALLS_PER_SEC, period=1) + for _ in range(TOTAL_CALLS): + with ratelimiter: + actual_calls[datetime.now().replace(microsecond=0).timestamp()] += 1 + + assert len(actual_calls) == round(TOTAL_CALLS / MAX_CALLS_PER_SEC) + assert all(calls <= MAX_CALLS_PER_SEC for calls in actual_calls.values()) + assert sum(actual_calls.values()) == TOTAL_CALLS diff --git a/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py b/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py new file mode 100644 index 00000000000000..6c719d351c4c20 --- /dev/null +++ b/metadata-ingestion/tests/unit/utilities/test_sqlalchemy_type_converter.py @@ -0,0 +1,93 @@ +from typing import no_type_check + +from sqlalchemy import types +from sqlalchemy_bigquery import STRUCT + +from 
datahub.metadata.schema_classes import ( + ArrayTypeClass, + MapTypeClass, + NullTypeClass, + NumberTypeClass, + RecordTypeClass, + StringTypeClass, +) +from datahub.utilities.sqlalchemy_type_converter import ( + MapType, + get_schema_fields_for_sqlalchemy_column, +) + + +def test_get_avro_schema_for_sqlalchemy_column(): + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=types.INTEGER() + ) + assert len(schema_fields) == 1 + assert schema_fields[0].fieldPath == "[version=2.0].[type=int].test" + assert schema_fields[0].type.type == NumberTypeClass() + assert schema_fields[0].nativeDataType == "INTEGER" + assert schema_fields[0].nullable is True + + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=types.String(), nullable=False + ) + assert len(schema_fields) == 1 + assert schema_fields[0].fieldPath == "[version=2.0].[type=string].test" + assert schema_fields[0].type.type == StringTypeClass() + assert schema_fields[0].nativeDataType == "VARCHAR" + assert schema_fields[0].nullable is False + + +def test_get_avro_schema_for_sqlalchemy_array_column(): + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=types.ARRAY(types.FLOAT()) + ) + assert len(schema_fields) == 1 + assert ( + schema_fields[0].fieldPath + == "[version=2.0].[type=struct].[type=array].[type=float].test" + ) + assert schema_fields[0].type.type == ArrayTypeClass(nestedType=["float"]) + assert schema_fields[0].nativeDataType == "array" + + +def test_get_avro_schema_for_sqlalchemy_map_column(): + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=MapType(types.String(), types.BOOLEAN()) + ) + assert len(schema_fields) == 1 + assert ( + schema_fields[0].fieldPath + == "[version=2.0].[type=struct].[type=map].[type=boolean].test" + ) + assert schema_fields[0].type.type == MapTypeClass( + keyType="string", valueType="boolean" + ) + assert schema_fields[0].nativeDataType == "MapType(String(), BOOLEAN())" + + +def test_get_avro_schema_for_sqlalchemy_struct_column() -> None: + + schema_fields = get_schema_fields_for_sqlalchemy_column( + column_name="test", column_type=STRUCT(("test", types.INTEGER())) + ) + assert len(schema_fields) == 2 + assert ( + schema_fields[0].fieldPath == "[version=2.0].[type=struct].[type=struct].test" + ) + assert schema_fields[0].type.type == RecordTypeClass() + assert schema_fields[0].nativeDataType == "STRUCT" + + assert ( + schema_fields[1].fieldPath + == "[version=2.0].[type=struct].[type=struct].test.[type=int].test" + ) + assert schema_fields[1].type.type == NumberTypeClass() + assert schema_fields[1].nativeDataType == "INTEGER" + + +@no_type_check +def test_get_avro_schema_for_sqlalchemy_unknown_column(): + schema_fields = get_schema_fields_for_sqlalchemy_column("invalid", "test") + assert len(schema_fields) == 1 + assert schema_fields[0].type.type == NullTypeClass() + assert schema_fields[0].fieldPath == "[version=2.0].[type=null]" + assert schema_fields[0].nativeDataType == "test" diff --git a/metadata-integration/java/datahub-client/build.gradle b/metadata-integration/java/datahub-client/build.gradle index 95de3cdb3c5262..0bf6b18fa50731 100644 --- a/metadata-integration/java/datahub-client/build.gradle +++ b/metadata-integration/java/datahub-client/build.gradle @@ -30,7 +30,7 @@ dependencies { implementation(externalDependency.kafkaAvroSerializer) { exclude group: "org.apache.avro" } - implementation externalDependency.avro_1_7 + implementation
externalDependency.avro constraints { implementation('commons-collections:commons-collections:3.2.2') { because 'Vulnerability Issue' @@ -97,6 +97,7 @@ shadowJar { // we can move to automatic relocation using ConfigureShadowRelocation after we get to a good place on these first relocate 'org.springframework', 'datahub.shaded.org.springframework' relocate 'com.fasterxml.jackson', 'datahub.shaded.jackson' + relocate 'org.yaml', 'io.acryl.shaded.org.yaml' // Required for shading snakeyaml relocate 'net.jcip.annotations', 'datahub.shaded.annotations' relocate 'javassist', 'datahub.shaded.javassist' relocate 'edu.umd.cs.findbugs', 'datahub.shaded.findbugs' @@ -242,4 +243,4 @@ checkstyleMain.exclude '**/generated/**' clean { project.delete("$projectDir/generated") -} \ No newline at end of file +} diff --git a/metadata-integration/java/datahub-client/src/main/java/datahub/client/kafka/AvroSerializer.java b/metadata-integration/java/datahub-client/src/main/java/datahub/client/kafka/AvroSerializer.java index ee0d459aaa7d3b..6212e57470be4f 100644 --- a/metadata-integration/java/datahub-client/src/main/java/datahub/client/kafka/AvroSerializer.java +++ b/metadata-integration/java/datahub-client/src/main/java/datahub/client/kafka/AvroSerializer.java @@ -16,12 +16,14 @@ class AvroSerializer { private final Schema _recordSchema; private final Schema _genericAspectSchema; + private final Schema _changeTypeEnumSchema; private final EventFormatter _eventFormatter; public AvroSerializer() throws IOException { _recordSchema = new Schema.Parser() .parse(this.getClass().getClassLoader().getResourceAsStream("MetadataChangeProposal.avsc")); _genericAspectSchema = this._recordSchema.getField("aspect").schema().getTypes().get(1); + _changeTypeEnumSchema = this._recordSchema.getField("changeType").schema(); _eventFormatter = new EventFormatter(EventFormatter.Format.PEGASUS_JSON); } @@ -43,7 +45,7 @@ public GenericRecord serialize(MetadataChangeProposal mcp) throws IOException { genericRecord.put("aspect", genericAspect); genericRecord.put("aspectName", mcp.getAspectName()); genericRecord.put("entityType", mcp.getEntityType()); - genericRecord.put("changeType", mcp.getChangeType()); + genericRecord.put("changeType", new GenericData.EnumSymbol(_changeTypeEnumSchema, mcp.getChangeType())); return genericRecord; } } \ No newline at end of file diff --git a/metadata-integration/java/datahub-protobuf/README.md b/metadata-integration/java/datahub-protobuf/README.md index daea8d438679c8..29b82aa3e68f5e 100644 --- a/metadata-integration/java/datahub-protobuf/README.md +++ b/metadata-integration/java/datahub-protobuf/README.md @@ -1,6 +1,6 @@ # Protobuf Schemas -The `datahub-protobuf` module is designed to be used with the Java Emitter, the input is a compiled protobuf binary `*.protoc` files and optionally the corresponding `*.proto` source code. In addition, you can supply the root message in cases where a single protobuf source file includes multiple non-nested messages. +The `datahub-protobuf` module is designed to be used with the Java Emitter. Its input is a compiled protobuf binary (`*.protoc` file) and, optionally, the corresponding `*.proto` source code. You can supply a file with multiple nested messages to be processed. If you have a file with multiple non-nested messages, you will need to separate them into different files or supply the root message; otherwise only the first one will be processed. 
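+
+For example, a minimal sketch of feeding a compiled descriptor to the module via the Java Emitter might look like the following (the exact `ProtobufDataset` builder methods shown here are illustrative; check them against the module's source):
+
+```java
+import java.io.FileInputStream;
+import com.linkedin.common.AuditStamp;
+import com.linkedin.common.FabricType;
+import com.linkedin.common.urn.CorpuserUrn;
+import com.linkedin.common.urn.DataPlatformUrn;
+import datahub.protobuf.ProtobufDataset;
+
+// Record who is emitting this schema and when.
+AuditStamp auditStamp = new AuditStamp()
+    .setTime(System.currentTimeMillis())
+    .setActor(new CorpuserUrn("datahub"));
+
+// Point the builder at the compiled *.protoc descriptor; without a root
+// message, only the first non-nested message in the file is processed.
+ProtobufDataset dataset = ProtobufDataset.builder()
+    .setDataPlatformUrn(new DataPlatformUrn("kafka"))
+    .setProtocIn(new FileInputStream("path/to/descriptor.protoc"))
+    .setAuditStamp(auditStamp)
+    .setFabricType(FabricType.DEV)
+    .build();
+```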
## Supported Features diff --git a/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/model/ProtobufField.java b/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/model/ProtobufField.java index 42884241d9f7cd..d890c373f12994 100644 --- a/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/model/ProtobufField.java +++ b/metadata-integration/java/datahub-protobuf/src/main/java/datahub/protobuf/model/ProtobufField.java @@ -259,7 +259,9 @@ private FieldDescriptorProto getNestedTypeFields(List pathList, Descrip messageType = messageType.getNestedType(value); } - if (pathList.get(pathSize - 2) == DescriptorProto.FIELD_FIELD_NUMBER) { + if (pathList.get(pathSize - 2) == DescriptorProto.FIELD_FIELD_NUMBER + && pathList.get(pathSize - 1) != DescriptorProto.RESERVED_RANGE_FIELD_NUMBER + && pathList.get(pathSize - 1) != DescriptorProto.RESERVED_NAME_FIELD_NUMBER) { return messageType.getField(pathList.get(pathSize - 1)); } else { return null; diff --git a/metadata-io/build.gradle b/metadata-io/build.gradle index ad54cf65243982..740fed61f13d56 100644 --- a/metadata-io/build.gradle +++ b/metadata-io/build.gradle @@ -8,9 +8,9 @@ configurations { dependencies { implementation project(':entity-registry') api project(':metadata-utils') - api project(':metadata-events:mxe-avro-1.7') + api project(':metadata-events:mxe-avro') api project(':metadata-events:mxe-registration') - api project(':metadata-events:mxe-utils-avro-1.7') + api project(':metadata-events:mxe-utils-avro') api project(':metadata-models') api project(':metadata-service:restli-client') api project(':metadata-service:configuration') diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java index 13a7d16b723a78..ceaf37a1289d99 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/client/CachingEntitySearchService.java @@ -256,13 +256,13 @@ public ScrollResult getCachedScrollResults( cacheAccess.stop(); if (result == null) { Timer.Context cacheMiss = MetricUtils.timer(this.getClass(), "scroll_cache_miss").time(); - result = getRawScrollResults(entities, query, filters, sortCriterion, scrollId, keepAlive, size, isFullText); + result = getRawScrollResults(entities, query, filters, sortCriterion, scrollId, keepAlive, size, isFullText, flags); cache.put(cacheKey, toJsonString(result)); cacheMiss.stop(); MetricUtils.counter(this.getClass(), "scroll_cache_miss_count").inc(); } } else { - result = getRawScrollResults(entities, query, filters, sortCriterion, scrollId, keepAlive, size, isFullText); + result = getRawScrollResults(entities, query, filters, sortCriterion, scrollId, keepAlive, size, isFullText, flags); } return result; } @@ -328,7 +328,8 @@ private ScrollResult getRawScrollResults( @Nullable final String scrollId, @Nullable final String keepAlive, final int count, - final boolean fulltext) { + final boolean fulltext, + @Nullable final SearchFlags searchFlags) { if (fulltext) { return entitySearchService.fullTextScroll( entities, @@ -337,7 +338,8 @@ private ScrollResult getRawScrollResults( sortCriterion, scrollId, keepAlive, - count); + count, + searchFlags); } else { return entitySearchService.structuredScroll(entities, input, @@ -345,7 +347,8 @@ private ScrollResult getRawScrollResults( sortCriterion, scrollId, 
keepAlive, - count); + count, + searchFlags); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java index ef5a555e95ba89..024cf2b0abec23 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/ElasticSearchService.java @@ -175,23 +175,26 @@ public List getBrowsePaths(@Nonnull String entityName, @Nonnull Urn urn) @Nonnull @Override public ScrollResult fullTextScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size) { + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size, @Nullable SearchFlags searchFlags) { log.debug(String.format( "Scrolling Structured Search documents entities: %s, input: %s, postFilters: %s, sortCriterion: %s, scrollId: %s, size: %s", entities, input, postFilters, sortCriterion, scrollId, size)); + SearchFlags flags = Optional.ofNullable(searchFlags).orElse(new SearchFlags()); + flags.setFulltext(true); return esSearchDAO.scroll(entities, input, postFilters, sortCriterion, scrollId, keepAlive, size, - new SearchFlags().setFulltext(true)); + flags); } @Nonnull @Override public ScrollResult structuredScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size) { + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nullable String keepAlive, int size, @Nullable SearchFlags searchFlags) { log.debug(String.format( "Scrolling FullText Search documents entities: %s, input: %s, postFilters: %s, sortCriterion: %s, scrollId: %s, size: %s", entities, input, postFilters, sortCriterion, scrollId, size)); - return esSearchDAO.scroll(entities, input, postFilters, sortCriterion, scrollId, keepAlive, size, - new SearchFlags().setFulltext(false)); + SearchFlags flags = Optional.ofNullable(searchFlags).orElse(new SearchFlags()); + flags.setFulltext(false); + return esSearchDAO.scroll(entities, input, postFilters, sortCriterion, scrollId, keepAlive, size, flags); } public Optional raw(@Nonnull String indexName, @Nullable String jsonQuery) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index 004b2e0a2adc4c..35cef71edd9538 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -5,6 +5,7 @@ import com.linkedin.metadata.models.SearchScoreFieldSpec; import com.linkedin.metadata.models.SearchableFieldSpec; import com.linkedin.metadata.models.annotation.SearchableAnnotation.FieldType; +import com.linkedin.metadata.search.utils.ESUtils; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -31,15 +32,6 @@ public static Map getPartialNgramConfigWithOverrides(Map KEYWORD_TYPE_MAP = ImmutableMap.of(TYPE, KEYWORD); - // Field Types - public static final String BOOLEAN = "boolean"; - public static final String DATE = "date"; - public static 
final String DOUBLE = "double"; - public static final String LONG = "long"; - public static final String OBJECT = "object"; - public static final String TEXT = "text"; - public static final String TOKEN_COUNT = "token_count"; - // Subfields public static final String DELIMITED = "delimited"; public static final String LENGTH = "length"; @@ -74,7 +66,7 @@ public static Map getMappings(@Nonnull final EntitySpec entitySp private static Map getMappingsForUrn() { Map subFields = new HashMap<>(); subFields.put(DELIMITED, ImmutableMap.of( - TYPE, TEXT, + TYPE, ESUtils.TEXT_FIELD_TYPE, ANALYZER, URN_ANALYZER, SEARCH_ANALYZER, URN_SEARCH_ANALYZER, SEARCH_QUOTE_ANALYZER, CUSTOM_QUOTE_ANALYZER) @@ -85,13 +77,13 @@ private static Map getMappingsForUrn() { ) )); return ImmutableMap.builder() - .put(TYPE, KEYWORD) + .put(TYPE, ESUtils.KEYWORD_FIELD_TYPE) .put(FIELDS, subFields) .build(); } private static Map getMappingsForRunId() { - return ImmutableMap.builder().put(TYPE, KEYWORD).build(); + return ImmutableMap.builder().put(TYPE, ESUtils.KEYWORD_FIELD_TYPE).build(); } private static Map getMappingsForField(@Nonnull final SearchableFieldSpec searchableFieldSpec) { @@ -104,23 +96,23 @@ private static Map getMappingsForField(@Nonnull final Searchable } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { mappingForField.putAll(getMappingsForSearchText(fieldType)); } else if (fieldType == FieldType.BROWSE_PATH) { - mappingForField.put(TYPE, TEXT); + mappingForField.put(TYPE, ESUtils.TEXT_FIELD_TYPE); mappingForField.put(FIELDS, ImmutableMap.of(LENGTH, ImmutableMap.of( - TYPE, TOKEN_COUNT, + TYPE, ESUtils.TOKEN_COUNT_FIELD_TYPE, ANALYZER, SLASH_PATTERN_ANALYZER))); mappingForField.put(ANALYZER, BROWSE_PATH_HIERARCHY_ANALYZER); mappingForField.put(FIELDDATA, true); } else if (fieldType == FieldType.BROWSE_PATH_V2) { - mappingForField.put(TYPE, TEXT); + mappingForField.put(TYPE, ESUtils.TEXT_FIELD_TYPE); mappingForField.put(FIELDS, ImmutableMap.of(LENGTH, ImmutableMap.of( - TYPE, TOKEN_COUNT, + TYPE, ESUtils.TOKEN_COUNT_FIELD_TYPE, ANALYZER, UNIT_SEPARATOR_PATTERN_ANALYZER))); mappingForField.put(ANALYZER, BROWSE_PATH_V2_HIERARCHY_ANALYZER); mappingForField.put(FIELDDATA, true); } else if (fieldType == FieldType.URN || fieldType == FieldType.URN_PARTIAL) { - mappingForField.put(TYPE, TEXT); + mappingForField.put(TYPE, ESUtils.TEXT_FIELD_TYPE); mappingForField.put(ANALYZER, URN_ANALYZER); mappingForField.put(SEARCH_ANALYZER, URN_SEARCH_ANALYZER); mappingForField.put(SEARCH_QUOTE_ANALYZER, CUSTOM_QUOTE_ANALYZER); @@ -135,13 +127,13 @@ private static Map getMappingsForField(@Nonnull final Searchable subFields.put(KEYWORD, KEYWORD_TYPE_MAP); mappingForField.put(FIELDS, subFields); } else if (fieldType == FieldType.BOOLEAN) { - mappingForField.put(TYPE, BOOLEAN); + mappingForField.put(TYPE, ESUtils.BOOLEAN_FIELD_TYPE); } else if (fieldType == FieldType.COUNT) { - mappingForField.put(TYPE, LONG); + mappingForField.put(TYPE, ESUtils.LONG_FIELD_TYPE); } else if (fieldType == FieldType.DATETIME) { - mappingForField.put(TYPE, DATE); + mappingForField.put(TYPE, ESUtils.DATE_FIELD_TYPE); } else if (fieldType == FieldType.OBJECT) { - mappingForField.put(TYPE, OBJECT); + mappingForField.put(TYPE, ESUtils.OBJECT_FIELD_TYPE); } else { log.info("FieldType {} has no mappings implemented", fieldType); } @@ -149,10 +141,10 @@ private static Map getMappingsForField(@Nonnull final Searchable searchableFieldSpec.getSearchableAnnotation() .getHasValuesFieldName() - 
.ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, BOOLEAN))); + .ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, ESUtils.BOOLEAN_FIELD_TYPE))); searchableFieldSpec.getSearchableAnnotation() .getNumValuesFieldName() - .ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, LONG))); + .ifPresent(fieldName -> mappings.put(fieldName, ImmutableMap.of(TYPE, ESUtils.LONG_FIELD_TYPE))); mappings.putAll(getMappingsForFieldNameAliases(searchableFieldSpec)); return mappings; @@ -160,7 +152,7 @@ private static Map getMappingsForField(@Nonnull final Searchable private static Map getMappingsForKeyword() { Map mappingForField = new HashMap<>(); - mappingForField.put(TYPE, KEYWORD); + mappingForField.put(TYPE, ESUtils.KEYWORD_FIELD_TYPE); mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); // Add keyword subfield without lowercase filter mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP)); @@ -169,7 +161,7 @@ private static Map getMappingsForKeyword() { private static Map getMappingsForSearchText(FieldType fieldType) { Map mappingForField = new HashMap<>(); - mappingForField.put(TYPE, KEYWORD); + mappingForField.put(TYPE, ESUtils.KEYWORD_FIELD_TYPE); mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); Map subFields = new HashMap<>(); if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { @@ -186,14 +178,14 @@ private static Map getMappingsForSearchText(FieldType fieldType) String fieldName = entry.getKey(); String analyzerName = entry.getValue(); subFields.put(fieldName, ImmutableMap.of( - TYPE, TEXT, + TYPE, ESUtils.TEXT_FIELD_TYPE, ANALYZER, analyzerName )); } } } subFields.put(DELIMITED, ImmutableMap.of( - TYPE, TEXT, + TYPE, ESUtils.TEXT_FIELD_TYPE, ANALYZER, TEXT_ANALYZER, SEARCH_ANALYZER, TEXT_SEARCH_ANALYZER, SEARCH_QUOTE_ANALYZER, CUSTOM_QUOTE_ANALYZER)); @@ -206,7 +198,7 @@ private static Map getMappingsForSearchText(FieldType fieldType) private static Map getMappingsForSearchScoreField( @Nonnull final SearchScoreFieldSpec searchScoreFieldSpec) { return ImmutableMap.of(searchScoreFieldSpec.getSearchScoreAnnotation().getFieldName(), - ImmutableMap.of(TYPE, DOUBLE)); + ImmutableMap.of(TYPE, ESUtils.DOUBLE_FIELD_TYPE)); } private static Map getMappingsForFieldNameAliases(@Nonnull final SearchableFieldSpec searchableFieldSpec) { diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java index 5fcc10b7af5cfa..49571a60d5f211 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandler.java @@ -202,7 +202,7 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi if (!finalSearchFlags.isSkipHighlighting()) { searchSourceBuilder.highlighter(_highlights); } - ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion, _entitySpecs); if (finalSearchFlags.isGetSuggestions()) { ESUtils.buildNameSuggestions(searchSourceBuilder, input); @@ -242,8 +242,10 @@ public SearchRequest getSearchRequest(@Nonnull String input, @Nullable Filter fi BoolQueryBuilder filterQuery = getFilterQuery(filter); searchSourceBuilder.query(QueryBuilders.boolQuery().must(getQuery(input, 
finalSearchFlags.isFulltext())).filter(filterQuery)); _aggregationQueryBuilder.getAggregations().forEach(searchSourceBuilder::aggregation); - searchSourceBuilder.highlighter(getHighlights()); - ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + if (!finalSearchFlags.isSkipHighlighting()) { + searchSourceBuilder.highlighter(_highlights); + } + ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion, _entitySpecs); searchRequest.source(searchSourceBuilder); log.debug("Search request is: " + searchRequest); searchRequest.indicesOptions(null); @@ -270,7 +272,7 @@ public SearchRequest getFilterRequest(@Nullable Filter filters, @Nullable SortCr final SearchSourceBuilder searchSourceBuilder = new SearchSourceBuilder(); searchSourceBuilder.query(filterQuery); searchSourceBuilder.from(from).size(size); - ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion, _entitySpecs); searchRequest.source(searchSourceBuilder); return searchRequest; @@ -301,7 +303,7 @@ public SearchRequest getFilterRequest(@Nullable Filter filters, @Nullable SortCr searchSourceBuilder.size(size); ESUtils.setSearchAfter(searchSourceBuilder, sort, pitId, keepAlive); - ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion); + ESUtils.buildSortOrder(searchSourceBuilder, sortCriterion, _entitySpecs); searchRequest.source(searchSourceBuilder); return searchRequest; diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java index 9a7d9a1b4c4207..53765acb8e29e8 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/utils/ESUtils.java @@ -2,6 +2,9 @@ import com.google.common.collect.ImmutableList; import com.google.common.collect.ImmutableSet; +import com.linkedin.metadata.models.EntitySpec; +import com.linkedin.metadata.models.SearchableFieldSpec; +import com.linkedin.metadata.models.annotation.SearchableAnnotation; import com.linkedin.metadata.query.filter.Condition; import com.linkedin.metadata.query.filter.ConjunctiveCriterion; import com.linkedin.metadata.query.filter.Criterion; @@ -49,7 +52,28 @@ public class ESUtils { public static final int MAX_RESULT_SIZE = 10000; public static final String OPAQUE_ID_HEADER = "X-Opaque-Id"; public static final String HEADER_VALUE_DELIMITER = "|"; - public static final String KEYWORD_TYPE = "keyword"; + + // Field types + public static final String KEYWORD_FIELD_TYPE = "keyword"; + public static final String BOOLEAN_FIELD_TYPE = "boolean"; + public static final String DATE_FIELD_TYPE = "date"; + public static final String DOUBLE_FIELD_TYPE = "double"; + public static final String LONG_FIELD_TYPE = "long"; + public static final String OBJECT_FIELD_TYPE = "object"; + public static final String TEXT_FIELD_TYPE = "text"; + public static final String TOKEN_COUNT_FIELD_TYPE = "token_count"; + // End of field types + + public static final Set FIELD_TYPES_STORED_AS_KEYWORD = Set.of( + SearchableAnnotation.FieldType.KEYWORD, + SearchableAnnotation.FieldType.TEXT, + SearchableAnnotation.FieldType.TEXT_PARTIAL, + SearchableAnnotation.FieldType.WORD_GRAM); + public static final Set FIELD_TYPES_STORED_AS_TEXT = Set.of( + SearchableAnnotation.FieldType.BROWSE_PATH, + SearchableAnnotation.FieldType.BROWSE_PATH_V2, + SearchableAnnotation.FieldType.URN, + SearchableAnnotation.FieldType.URN_PARTIAL); public static final String 
ENTITY_NAME_FIELD = "_entityName"; public static final String NAME_SUGGESTION = "nameSuggestion"; @@ -174,6 +198,25 @@ public static QueryBuilder getQueryBuilderFromCriterion(@Nonnull final Criterion return getQueryBuilderFromCriterionForSingleField(criterion, isTimeseries); } + public static String getElasticTypeForFieldType(SearchableAnnotation.FieldType fieldType) { + if (FIELD_TYPES_STORED_AS_KEYWORD.contains(fieldType)) { + return KEYWORD_FIELD_TYPE; + } else if (FIELD_TYPES_STORED_AS_TEXT.contains(fieldType)) { + return TEXT_FIELD_TYPE; + } else if (fieldType == SearchableAnnotation.FieldType.BOOLEAN) { + return BOOLEAN_FIELD_TYPE; + } else if (fieldType == SearchableAnnotation.FieldType.COUNT) { + return LONG_FIELD_TYPE; + } else if (fieldType == SearchableAnnotation.FieldType.DATETIME) { + return DATE_FIELD_TYPE; + } else if (fieldType == SearchableAnnotation.FieldType.OBJECT) { + return OBJECT_FIELD_TYPE; + } else { + log.warn("FieldType {} has no mappings implemented", fieldType); + return null; + } + } + /** * Populates source field of search query with the sort order as per the criterion provided. * @@ -189,14 +232,39 @@ public static QueryBuilder getQueryBuilderFromCriterion(@Nonnull final Criterion * @param sortCriterion {@link SortCriterion} to be applied to the search results */ public static void buildSortOrder(@Nonnull SearchSourceBuilder searchSourceBuilder, - @Nullable SortCriterion sortCriterion) { + @Nullable SortCriterion sortCriterion, List entitySpecs) { if (sortCriterion == null) { searchSourceBuilder.sort(new ScoreSortBuilder().order(SortOrder.DESC)); } else { + Optional fieldTypeForDefault = Optional.empty(); + for (EntitySpec entitySpec : entitySpecs) { + List fieldSpecs = entitySpec.getSearchableFieldSpecs(); + for (SearchableFieldSpec fieldSpec : fieldSpecs) { + SearchableAnnotation annotation = fieldSpec.getSearchableAnnotation(); + if (annotation.getFieldName().equals(sortCriterion.getField()) + || annotation.getFieldNameAliases().contains(sortCriterion.getField())) { + fieldTypeForDefault = Optional.of(fieldSpec.getSearchableAnnotation().getFieldType()); + break; + } + } + if (fieldTypeForDefault.isPresent()) { + break; + } + } + if (fieldTypeForDefault.isEmpty()) { + log.warn("Sort criterion field " + sortCriterion.getField() + " was not found in any entity spec to be searched"); + } final SortOrder esSortOrder = (sortCriterion.getOrder() == com.linkedin.metadata.query.filter.SortOrder.ASCENDING) ? 
SortOrder.ASC : SortOrder.DESC; - searchSourceBuilder.sort(new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder).unmappedType(KEYWORD_TYPE)); + FieldSortBuilder sortBuilder = new FieldSortBuilder(sortCriterion.getField()).order(esSortOrder); + if (fieldTypeForDefault.isPresent()) { + String esFieldtype = getElasticTypeForFieldType(fieldTypeForDefault.get()); + if (esFieldtype != null) { + sortBuilder.unmappedType(esFieldtype); + } + } + searchSourceBuilder.sort(sortBuilder); } if (sortCriterion == null || !sortCriterion.getField().equals(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD)) { searchSourceBuilder.sort(new FieldSortBuilder(DEFAULT_SEARCH_RESULTS_SORT_BY_FIELD).order(SortOrder.ASC)); diff --git a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java index a496fc427138e9..3e8f83a531b591 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/timeseries/elastic/ElasticSearchTimeseriesAspectService.java @@ -169,7 +169,7 @@ public List getIndexSizes() { List res = new ArrayList<>(); try { String indicesPattern = _indexConvention.getAllTimeseriesAspectIndicesPattern(); - Response r = _searchClient.getLowLevelClient().performRequest(new Request("GET", indicesPattern + "/_stats")); + Response r = _searchClient.getLowLevelClient().performRequest(new Request("GET", "/" + indicesPattern + "/_stats")); JsonNode body = new ObjectMapper().readTree(r.getEntity().getContent()); body.get("indices").fields().forEachRemaining(entry -> { TimeseriesIndexSizeResult elemResult = new TimeseriesIndexSizeResult(); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java index 461a146022446c..696e3b62834bdb 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/LineageServiceTestBase.java @@ -47,8 +47,10 @@ import com.linkedin.metadata.utils.elasticsearch.IndexConvention; import com.linkedin.metadata.utils.elasticsearch.IndexConventionImpl; import org.junit.Assert; +import org.mockito.ArgumentCaptor; import org.mockito.Mockito; import org.opensearch.client.RestHighLevelClient; +import org.opensearch.action.search.SearchRequest; import org.springframework.cache.CacheManager; import org.springframework.cache.concurrent.ConcurrentMapCacheManager; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; @@ -108,6 +110,7 @@ abstract public class LineageServiceTestBase extends AbstractTestNGSpringContext private GraphService _graphService; private CacheManager _cacheManager; private LineageSearchService _lineageSearchService; + private RestHighLevelClient _searchClientSpy; private static final String ENTITY_NAME = "testEntity"; private static final Urn TEST_URN = TestEntityUtil.getTestEntityUrn(); @@ -162,10 +165,11 @@ private ElasticSearchService buildEntitySearchService() { EntityIndexBuilders indexBuilders = new EntityIndexBuilders(getIndexBuilder(), _entityRegistry, _indexConvention, _settingsBuilder); - ESSearchDAO searchDAO = new ESSearchDAO(_entityRegistry, getSearchClient(), _indexConvention, false, + _searchClientSpy = spy(getSearchClient()); + ESSearchDAO searchDAO = new 
ESSearchDAO(_entityRegistry, _searchClientSpy, _indexConvention, false, ELASTICSEARCH_IMPLEMENTATION_ELASTICSEARCH, getSearchConfiguration(), null); - ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, getSearchClient(), _indexConvention, getSearchConfiguration(), getCustomSearchConfiguration()); - ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, getSearchClient(), _indexConvention, getBulkProcessor(), 1); + ESBrowseDAO browseDAO = new ESBrowseDAO(_entityRegistry, _searchClientSpy, _indexConvention, getSearchConfiguration(), getCustomSearchConfiguration()); + ESWriteDAO writeDAO = new ESWriteDAO(_entityRegistry, _searchClientSpy, _indexConvention, getBulkProcessor(), 1); return new ElasticSearchService(indexBuilders, searchDAO, browseDAO, writeDAO); } @@ -246,9 +250,15 @@ public void testSearchService() throws Exception { _elasticSearchService.upsertDocument(ENTITY_NAME, document2.toString(), urn2.toString()); syncAfterWrite(getBulkProcessor()); + Mockito.reset(_searchClientSpy); searchResult = searchAcrossLineage(null, TEST1); assertEquals(searchResult.getNumEntities().intValue(), 1); assertEquals(searchResult.getEntities().get(0).getEntity(), urn); + // Verify that highlighting was turned off in the query + ArgumentCaptor searchRequestCaptor = ArgumentCaptor.forClass(SearchRequest.class); + Mockito.verify(_searchClientSpy, times(1)).search(searchRequestCaptor.capture(), any()); + SearchRequest capturedRequest = searchRequestCaptor.getValue(); + assertNull(capturedRequest.source().highlighter()); clearCache(false); when(_graphService.getLineage(eq(TEST_URN), eq(LineageDirection.DOWNSTREAM), anyInt(), anyInt(), diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java index 16605048102965..69dd5c80bef1d1 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/fixtures/SampleDataFixtureTestBase.java @@ -22,12 +22,15 @@ import com.linkedin.metadata.query.filter.Criterion; import com.linkedin.metadata.query.filter.CriterionArray; import com.linkedin.metadata.query.filter.Filter; +import com.linkedin.metadata.query.filter.SortCriterion; +import com.linkedin.metadata.query.filter.SortOrder; import com.linkedin.metadata.search.AggregationMetadata; import com.linkedin.metadata.search.ScrollResult; import com.linkedin.metadata.search.SearchEntity; import com.linkedin.metadata.search.SearchResult; import com.linkedin.metadata.search.SearchService; import com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig; +import com.linkedin.metadata.search.utils.ESUtils; import com.linkedin.r2.RemoteInvocationException; import org.junit.Assert; import org.opensearch.client.RequestOptions; @@ -36,6 +39,9 @@ import org.opensearch.client.indices.AnalyzeResponse; import org.opensearch.client.indices.GetMappingsRequest; import org.opensearch.client.indices.GetMappingsResponse; +import org.opensearch.search.builder.SearchSourceBuilder; +import org.opensearch.search.sort.FieldSortBuilder; +import org.opensearch.search.sort.SortBuilder; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.Test; @@ -54,11 +60,7 @@ import static com.linkedin.metadata.Constants.DATA_JOB_ENTITY_NAME; import static 
com.linkedin.metadata.search.elasticsearch.query.request.SearchQueryBuilder.STRUCTURED_QUERY_PREFIX; import static com.linkedin.metadata.utils.SearchUtil.AGGREGATION_SEPARATOR_CHAR; -import static io.datahubproject.test.search.SearchTestUtils.autocomplete; -import static io.datahubproject.test.search.SearchTestUtils.scroll; -import static io.datahubproject.test.search.SearchTestUtils.search; -import static io.datahubproject.test.search.SearchTestUtils.searchAcrossEntities; -import static io.datahubproject.test.search.SearchTestUtils.searchStructured; +import static io.datahubproject.test.search.SearchTestUtils.*; import static org.testng.Assert.assertEquals; import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertNotNull; @@ -174,6 +176,48 @@ public void testSearchFieldConfig() throws IOException { } } + @Test + public void testGetSortOrder() { + String dateFieldName = "lastOperationTime"; + List entityNamesToTestSearch = List.of("dataset", "chart", "corpgroup"); + List entitySpecs = entityNamesToTestSearch.stream().map( + name -> getEntityRegistry().getEntitySpec(name)) + .collect(Collectors.toList()); + SearchSourceBuilder builder = new SearchSourceBuilder(); + SortCriterion sortCriterion = new SortCriterion().setOrder(SortOrder.DESCENDING).setField(dateFieldName); + ESUtils.buildSortOrder(builder, sortCriterion, entitySpecs); + List> sorts = builder.sorts(); + assertEquals(sorts.size(), 2); // sort by last modified and then by urn + for (SortBuilder sort : sorts) { + assertTrue(sort instanceof FieldSortBuilder); + FieldSortBuilder fieldSortBuilder = (FieldSortBuilder) sort; + if (fieldSortBuilder.getFieldName().equals(dateFieldName)) { + assertEquals(fieldSortBuilder.order(), org.opensearch.search.sort.SortOrder.DESC); + assertEquals(fieldSortBuilder.unmappedType(), "date"); + } else { + assertEquals(fieldSortBuilder.getFieldName(), "urn"); + } + } + + // Test alias field + String entityNameField = "_entityName"; + SearchSourceBuilder nameBuilder = new SearchSourceBuilder(); + SortCriterion nameCriterion = new SortCriterion().setOrder(SortOrder.ASCENDING).setField(entityNameField); + ESUtils.buildSortOrder(nameBuilder, nameCriterion, entitySpecs); + sorts = nameBuilder.sorts(); + assertEquals(sorts.size(), 2); + for (SortBuilder sort : sorts) { + assertTrue(sort instanceof FieldSortBuilder); + FieldSortBuilder fieldSortBuilder = (FieldSortBuilder) sort; + if (fieldSortBuilder.getFieldName().equals(entityNameField)) { + assertEquals(fieldSortBuilder.order(), org.opensearch.search.sort.SortOrder.ASC); + assertEquals(fieldSortBuilder.unmappedType(), "keyword"); + } else { + assertEquals(fieldSortBuilder.getFieldName(), "urn"); + } + } + } + @Test public void testDatasetHasTags() throws IOException { GetMappingsRequest req = new GetMappingsRequest() @@ -1454,6 +1498,16 @@ public void testColumnExactMatch() { "Expected table with column name exact match first"); } + @Test + public void testSortOrdering() { + String query = "unit_data"; + SortCriterion criterion = new SortCriterion().setOrder(SortOrder.ASCENDING).setField("lastOperationTime"); + SearchResult result = getSearchService().searchAcrossEntities(SEARCHABLE_ENTITIES, query, null, criterion, 0, + 100, new SearchFlags().setFulltext(true).setSkipCache(true), null); + assertTrue(result.getEntities().size() > 2, + String.format("%s - Expected search results to have at least two results", query)); + } + private Stream getTokens(AnalyzeRequest request) throws IOException { return 
getSearchClient().indices().analyze(request, RequestOptions.DEFAULT).getTokens().stream(); } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java index 90c6c523c588ff..0ea035a10f91da 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/query/request/SearchRequestHandlerTest.java @@ -97,6 +97,30 @@ public void testDatasetFieldsAndHighlights() { ), "unexpected lineage fields in highlights: " + highlightFields); } + @Test + public void testSearchRequestHandlerHighlightingTurnedOff() { + SearchRequestHandler requestHandler = SearchRequestHandler.getBuilder(TestEntitySpecBuilder.getSpec(), testQueryConfig, null); + SearchRequest searchRequest = requestHandler.getSearchRequest("testQuery", null, null, 0, + 10, new SearchFlags().setFulltext(false).setSkipHighlighting(true), null); + SearchSourceBuilder sourceBuilder = searchRequest.source(); + assertEquals(sourceBuilder.from(), 0); + assertEquals(sourceBuilder.size(), 10); + // Filters + Collection aggBuilders = sourceBuilder.aggregations().getAggregatorFactories(); + // Expect 2 aggregations: textFieldOverride and _entityType + assertEquals(aggBuilders.size(), 2); + for (AggregationBuilder aggBuilder : aggBuilders) { + if (aggBuilder.getName().equals("textFieldOverride")) { + TermsAggregationBuilder filterPanelBuilder = (TermsAggregationBuilder) aggBuilder; + assertEquals(filterPanelBuilder.field(), "textFieldOverride.keyword"); + } else if (!aggBuilder.getName().equals("_entityType")) { + fail("Found unexpected aggregation: " + aggBuilder.getName()); + } + } + // Highlights should not be present + assertNull(sourceBuilder.highlighter()); + } + @Test public void testSearchRequestHandler() { SearchRequestHandler requestHandler = SearchRequestHandler.getBuilder(TestEntitySpecBuilder.getSpec(), testQueryConfig, null); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java index cc60ba8679e1f0..1362a0f69eff2d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/timeseries/search/TimeseriesAspectServiceTestBase.java @@ -45,6 +45,7 @@ import com.linkedin.timeseries.GroupingBucket; import com.linkedin.timeseries.GroupingBucketType; import com.linkedin.timeseries.TimeWindowSize; +import com.linkedin.timeseries.TimeseriesIndexSizeResult; import org.opensearch.client.RestHighLevelClient; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; import org.testng.annotations.BeforeClass; @@ -884,4 +885,23 @@ public void testCountByFilterAfterDelete() throws InterruptedException { _elasticSearchTimeseriesAspectService.countByFilter(ENTITY_NAME, ASPECT_NAME, urnAndTimeFilter); assertEquals(count, 0L); } + + @Test(groups = {"getAggregatedStats"}, dependsOnGroups = {"upsert"}) + public void testGetIndexSizes() { + List result = _elasticSearchTimeseriesAspectService.getIndexSizes(); + //CHECKSTYLE:OFF + /* + Example result: + {aspectName=testentityprofile, sizeMb=52.234, + indexName=es_timeseries_aspect_service_test_testentity_testentityprofileaspect_v1, entityName=testentity} + 
{aspectName=testentityprofile, sizeMb=0.208, + indexName=es_timeseries_aspect_service_test_testentitywithouttests_testentityprofileaspect_v1, entityName=testentitywithouttests} + */ + // There may be other indices in there from other tests, so just make sure that index for entity + aspect is in there + //CHECKSTYLE:ON + assertTrue(result.size() > 0); + assertTrue( + result.stream().anyMatch(idxSizeResult -> idxSizeResult.getIndexName().equals( + "es_timeseries_aspect_service_test_testentity_testentityprofileaspect_v1"))); + } } diff --git a/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java index 67e1ee368f5136..34aa6978f742fc 100644 --- a/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java +++ b/metadata-io/src/test/java/io/datahubproject/test/search/SearchTestContainer.java @@ -5,7 +5,9 @@ import java.time.Duration; public interface SearchTestContainer { - String SEARCH_JAVA_OPTS = "-Xms64m -Xmx384m -XX:MaxDirectMemorySize=368435456"; + + String SEARCH_JAVA_OPTS = "-Xms446m -Xmx446m -XX:MaxDirectMemorySize=368435456"; + Duration STARTUP_TIMEOUT = Duration.ofMinutes(5); // usually < 1min GenericContainer startContainer(); diff --git a/metadata-jobs/mae-consumer/build.gradle b/metadata-jobs/mae-consumer/build.gradle index d36fd0de40d035..fcb8b62e4ac9d5 100644 --- a/metadata-jobs/mae-consumer/build.gradle +++ b/metadata-jobs/mae-consumer/build.gradle @@ -21,9 +21,9 @@ dependencies { implementation project(':ingestion-scheduler') implementation project(':metadata-utils') implementation project(":entity-registry") - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-events:mxe-utils-avro') implementation project(':datahub-graphql-core') implementation externalDependency.elasticSearchRest diff --git a/metadata-jobs/mce-consumer/build.gradle b/metadata-jobs/mce-consumer/build.gradle index 0bca55e0e5f92d..97eec9fcff051c 100644 --- a/metadata-jobs/mce-consumer/build.gradle +++ b/metadata-jobs/mce-consumer/build.gradle @@ -17,9 +17,9 @@ dependencies { } implementation project(':metadata-utils') implementation project(':metadata-events:mxe-schemas') - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-events:mxe-utils-avro') implementation project(':metadata-io') implementation project(':metadata-service:restli-client') implementation spec.product.pegasus.restliClient diff --git a/metadata-jobs/pe-consumer/build.gradle b/metadata-jobs/pe-consumer/build.gradle index 1899a4de15635a..81e8b8c9971f00 100644 --- a/metadata-jobs/pe-consumer/build.gradle +++ b/metadata-jobs/pe-consumer/build.gradle @@ -10,9 +10,9 @@ configurations { dependencies { avro project(path: ':metadata-models', configuration: 'avroSchema') implementation project(':li-utils') - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation 
project(':metadata-events:mxe-utils-avro') implementation(project(':metadata-service:factories')) { exclude group: 'org.neo4j.test' } diff --git a/metadata-models/build.gradle b/metadata-models/build.gradle index 53e7765152aefe..bd8052283e168f 100644 --- a/metadata-models/build.gradle +++ b/metadata-models/build.gradle @@ -23,6 +23,7 @@ dependencies { } } api project(':li-utils') + api project(path: ':li-utils', configuration: "dataTemplate") dataModel project(':li-utils') compileOnly externalDependency.lombok diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionAction.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionAction.pdl new file mode 100644 index 00000000000000..df6620b66bfd8f --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionAction.pdl @@ -0,0 +1,22 @@ +namespace com.linkedin.assertion + +/** + * The Actions about an Assertion. + * In the future, we'll likely extend this model to support additional + * parameters or options related to the assertion actions. + */ +record AssertionAction { + /** + * The type of the Action + */ + type: enum AssertionActionType { + /** + * Raise an incident. + */ + RAISE_INCIDENT + /** + * Resolve open incidents related to the assertion. + */ + RESOLVE_INCIDENT + } +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionActions.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionActions.pdl new file mode 100644 index 00000000000000..61846c1ba9c12d --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionActions.pdl @@ -0,0 +1,18 @@ +namespace com.linkedin.assertion + +/** + * The Actions about an Assertion + */ +@Aspect = { + "name": "assertionActions" +} +record AssertionActions { + /** + * Actions to be executed on successful assertion run. + */ + onSuccess: array[AssertionAction] = [] + /** + * Actions to be executed on failed assertion run. + */ + onFailure: array[AssertionAction] = [] +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl index 77ee147a781e20..e161270145a88d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionInfo.pdl @@ -13,13 +13,73 @@ record AssertionInfo includes CustomProperties, ExternalReference { /** * Type of assertion. Assertion types can evolve to span Datasets, Flows (Pipelines), Models, Features etc. */ + @Searchable = { } type: enum AssertionType { - // A single-dataset assertion. When this is the value, the datasetAssertion field will be populated. + /** + * A single-dataset assertion. When this is the value, the datasetAssertion field will be populated. + */ DATASET + + /** + * A freshness assertion, or an assertion which indicates when a particular operation should occur + * to an asset. + */ + FRESHNESS + + /** + * A volume assertion, or an assertion which indicates how much data should be available for a + * particular asset. + */ + VOLUME + + /** + * A raw SQL-statement based assertion + */ + SQL + + /** + * A schema or structural assertion. + * + * Would have named this SCHEMA but the codegen for PDL does not allow this (reserved word). + */ + DATA_SCHEMA } /** - * Dataset Assertion information when type is DATASET + * A Dataset Assertion definition. 
This field is populated when the type is DATASET. */ datasetAssertion: optional DatasetAssertionInfo + + /** + * A Freshness Assertion definition. This field is populated when the type is FRESHNESS. + */ + freshnessAssertion: optional FreshnessAssertionInfo + + /** + * A Volume Assertion definition. This field is populated when the type is VOLUME. + */ + volumeAssertion: optional VolumeAssertionInfo + + /** + * A SQL Assertion definition. This field is populated when the type is SQL. + */ + sqlAssertion: optional SqlAssertionInfo + + /** + * A schema Assertion definition. This field is populated when the type is DATA_SCHEMA. + */ + schemaAssertion: optional SchemaAssertionInfo + + /** + * The source or origin of the Assertion definition. + * + * If the source type of the Assertion is EXTERNAL, it is expected to have a corresponding dataPlatformInstance aspect detailing + * the platform where it was ingested from. + */ + source: optional AssertionSource + + /** + * An optional human-readable description of the assertion + */ + description: optional string } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl index decbfc08263de4..ded84e1969153b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResult.pdl @@ -5,10 +5,15 @@ namespace com.linkedin.assertion */ record AssertionResult { /** - * The final result, e.g. either SUCCESS or FAILURE. + * The final result, e.g. either SUCCESS, FAILURE, or ERROR. */ @TimeseriesField = {} + @Searchable = {} type: enum AssertionResultType { + /** + * The Assertion has not yet been fully evaluated + */ + INIT /** * The Assertion Succeeded */ @@ -17,6 +22,10 @@ record AssertionResult { * The Assertion Failed */ FAILURE + /** + * The Assertion encountered an Error + */ + ERROR } /** @@ -45,8 +54,13 @@ record AssertionResult { nativeResults: optional map[string, string] /** - * URL where full results are available + * External URL where full results are available. Only present when assertion source is not native. 
*/ externalUrl: optional string + /** + * The error object if AssertionResultType is ERROR + */ + error: optional AssertionResultError + } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl new file mode 100644 index 00000000000000..e768fe8521942f --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionResultError.pdl @@ -0,0 +1,45 @@ +namespace com.linkedin.assertion + +/** + * An error encountered when evaluating an AssertionResult + */ +record AssertionResultError { + /** + * The type of error encountered + */ + type: enum AssertionResultErrorType { + /** + * Source is unreachable + */ + SOURCE_CONNECTION_ERROR + /** + * Source query failed to execute + */ + SOURCE_QUERY_FAILED + /** + * Insufficient data to evaluate the assertion + */ + INSUFFICIENT_DATA + /** + * Invalid parameters were detected + */ + INVALID_PARAMETERS + /** + * Event type not supported by the specified source + */ + INVALID_SOURCE_TYPE + /** + * Unsupported platform + */ + UNSUPPORTED_PLATFORM + /** + * Unknown error + */ + UNKNOWN_ERROR + } + + /** + * Additional metadata depending on the type of error + */ + properties: optional map[string, string] +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl index 9e75f96fafd065..14f12042327404 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionRunEvent.pdl @@ -1,6 +1,7 @@ namespace com.linkedin.assertion -import com.linkedin.timeseries.TimeseriesAspectBase +import com.linkedin.timeseries.PartitionSpec +import com.linkedin.timeseries.TimeWindowSize import com.linkedin.common.ExternalReference import com.linkedin.common.Urn @@ -12,36 +13,31 @@ import com.linkedin.common.Urn "name": "assertionRunEvent", "type": "timeseries", } -record AssertionRunEvent includes TimeseriesAspectBase { +record AssertionRunEvent { + + /** + * The event timestamp field as epoch at UTC in milliseconds. + */ + @Searchable = { + "fieldName": "lastCompletedTime", + "fieldType": "DATETIME" + } + timestampMillis: long /** * Native (platform-specific) identifier for this run */ - //Multiple assertions could occur in same evaluator run runId: string - /* - * Urn of assertion which is evaluated - */ - @TimeseriesField = {} - assertionUrn: Urn - /* * Urn of entity on which the assertion is applicable */ - //example - dataset urn, if dataset is being asserted @TimeseriesField = {} asserteeUrn: Urn - - /** - * Specification of the batch which this run is evaluating - */ - batchSpec: optional BatchSpec /** * The status of the assertion run as per this timeseries event. */ - // Currently just supports COMPLETE, but should evolve to support other statuses like STARTED, RUNNING, etc. 
@TimeseriesField = {} status: enum AssertionRunStatus { /** @@ -59,4 +55,33 @@ record AssertionRunEvent includes TimeseriesAspectBase { * Runtime parameters of evaluation */ runtimeContext: optional map[string, string] + + /** + * Specification of the batch which this run is evaluating + */ + batchSpec: optional BatchSpec + + /* + * Urn of assertion which is evaluated + */ + @TimeseriesField = {} + assertionUrn: Urn + + /** + * Granularity of the event if applicable + */ + eventGranularity: optional TimeWindowSize + + /** + * The optional partition specification. + */ + partitionSpec: optional PartitionSpec = { + "type":"FULL_TABLE", + "partition":"FULL_TABLE_SNAPSHOT" + } + + /** + * The optional messageId, if provided, serves as a custom user-defined unique identifier for an aspect value. + */ + messageId: optional string } \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl new file mode 100644 index 00000000000000..d8892c0c71c6f6 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionSource.pdl @@ -0,0 +1,27 @@ +namespace com.linkedin.assertion + +/** + * The source of an assertion + */ +record AssertionSource { + /** + * The type of the Assertion Source + */ + @Searchable = { + "fieldName": "sourceType" + } + type: enum AssertionSourceType { + /** + * The assertion was defined natively on DataHub by a user. + */ + NATIVE + /** + * The assertion was defined and is managed outside of DataHub. + */ + EXTERNAL + /** + * The assertion was inferred, e.g. from offline AI / ML models. + */ + INFERRED + } +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdAggregation.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdAggregation.pdl index b79b96f9379b03..968944165a1c81 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdAggregation.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionStdAggregation.pdl @@ -4,6 +4,7 @@ namespace com.linkedin.assertion * The function that is applied to the aggregation input (schema, rows, column values) before evaluating an operator. */ enum AssertionStdAggregation { + /** * Assertion is applied on number of rows. */ @@ -20,7 +21,7 @@ enum AssertionStdAggregation { COLUMN_COUNT /** - * Assertion is applied on individual column value. + * Assertion is applied on individual column value. (No aggregation) */ IDENTITY @@ -42,6 +43,13 @@ enum AssertionStdAggregation { /** * Assertion is applied on proportion of distinct values in column */ + UNIQUE_PROPORTION + + /** + * Assertion is applied on proportion of distinct values in column + * + * Deprecated! Use UNIQUE_PROPORTION instead. + */ UNIQUE_PROPOTION /** diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionValueChangeType.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionValueChangeType.pdl new file mode 100644 index 00000000000000..5a1ff4fa73ffbb --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AssertionValueChangeType.pdl @@ -0,0 +1,16 @@ +namespace com.linkedin.assertion + +/** +* An enum to represent a type of change in an assertion value, metric, or measurement. +*/ +enum AssertionValueChangeType { + /** + * A change that is defined in absolute terms. 
+ */ + ABSOLUTE + /** + * A change that is defined in relative terms using percentage change + * from the original value. + */ + PERCENTAGE +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/AuditLogSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/AuditLogSpec.pdl new file mode 100644 index 00000000000000..4d5bf261cbf89b --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/AuditLogSpec.pdl @@ -0,0 +1,18 @@ +namespace com.linkedin.assertion + +import com.linkedin.schema.SchemaFieldDataType + +/** +* Information about the Audit Log operation to use in evaluating an assertion. +**/ +record AuditLogSpec { + /** + * The list of operation types that should be monitored. If not provided, a default set will be used. + */ + operationTypes: optional array [string] + + /** + * Optional: The user name associated with the operation. + */ + userName: optional string +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/DatasetAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/DatasetAssertionInfo.pdl index c411c7ff8a5721..2a8bf28f1ff11a 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/assertion/DatasetAssertionInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/DatasetAssertionInfo.pdl @@ -18,9 +18,10 @@ record DatasetAssertionInfo { /** * Scope of the Assertion. What part of the dataset does this assertion apply to? **/ + @Searchable = {} scope: enum DatasetAssertionScope { /** - * This assertion applies to dataset columns + * This assertion applies to dataset column(s) */ DATASET_COLUMN @@ -29,6 +30,11 @@ record DatasetAssertionInfo { */ DATASET_ROWS + /** + * This assertion applies to the storage size of the dataset + */ + DATASET_STORAGE_SIZE + /** * This assertion applies to the schema of the dataset */ @@ -41,7 +47,9 @@ record DatasetAssertionInfo { } /** - * One or more dataset schema fields that are targeted by this assertion + * One or more dataset schema fields that are targeted by this assertion. + * + * This field is expected to be provided if the assertion scope is DATASET_COLUMN. */ @Relationship = { "/*": { @@ -49,11 +57,18 @@ record DatasetAssertionInfo { "entityTypes": [ "schemaField" ] } } + @Searchable = { + "/*": { + "fieldType": "URN" + } + } fields: optional array[Urn] /** * Standardized assertion operator + * This field is left blank if there is no selected aggregation or metric for a particular column. */ + @Searchable = {} aggregation: optional AssertionStdAggregation /** diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FixedIntervalSchedule.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FixedIntervalSchedule.pdl new file mode 100644 index 00000000000000..c08c33ffb92d32 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FixedIntervalSchedule.pdl @@ -0,0 +1,10 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn +import com.linkedin.timeseries.TimeWindowSize + +/** +* Attributes defining a relative fixed interval SLA schedule. 
+*/ +record FixedIntervalSchedule includes TimeWindowSize { +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionInfo.pdl new file mode 100644 index 00000000000000..4445a11ff40a74 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionInfo.pdl @@ -0,0 +1,53 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn +import com.linkedin.dataset.DatasetFilter + +/** +* Attributes defining a Freshness Assertion. +**/ +record FreshnessAssertionInfo { + /** + * The type of the freshness assertion being monitored. + */ + @Searchable = {} + type: enum FreshnessAssertionType { + /** + * A Freshness assertion based on Operations performed on a particular Dataset (insert, update, delete, etc) and sourced from an audit log, as + * opposed to based on the highest watermark in a timestamp column (e.g. a query). Only valid when entity is of type "dataset". + */ + DATASET_CHANGE + /** + * A Freshness assertion based on a successful execution of a Data Job. + */ + DATA_JOB_RUN + } + + /** + * The entity targeted by this Freshness check. + */ + @Searchable = { + "fieldType": "URN" + } + @Relationship = { + "name": "Asserts", + "entityTypes": [ "dataset", "dataJob" ] + } + entity: Urn + + /** + * Produce FAILURE Assertion Result if the asset is not updated on the cadence and within the time range described by the schedule. + */ + @Searchable = { + "/type": { + "fieldName": "scheduleType" + } + } + schedule: FreshnessAssertionSchedule + + /** + * A definition of the specific filters that should be applied, when performing monitoring. + * If not provided, there is no filter, and the full table is under consideration. + */ + filter: optional DatasetFilter +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionSchedule.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionSchedule.pdl new file mode 100644 index 00000000000000..a87342ad4f5edd --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessAssertionSchedule.pdl @@ -0,0 +1,66 @@ +namespace com.linkedin.assertion + +import com.linkedin.common.Urn + +/** +* Attributes defining a single Freshness schedule. +*/ +record FreshnessAssertionSchedule { + + /** + * The type of a Freshness Assertion Schedule. + * + * Once we support data-time-relative schedules (e.g. schedules relative to time partitions), + * we will add those schedule types here. + */ + type: enum FreshnessAssertionScheduleType { + /** + * A highly configurable recurring schedule which describes the times of events described + * by a CRON schedule, with the evaluation schedule assumed to match the cron schedule. + * + * In a CRON schedule type, we compute the look-back window to be the time between the last scheduled event + * and the current event (evaluation time). This means that the evaluation schedule must match exactly + * the schedule defined inside the cron schedule. + * + * For example, a CRON schedule defined as "0 8 * * *" would represent a schedule of "every day by 8am". Assuming + * that the assertion evaluation schedule is defined to match this, the freshness assertion would be evaluated in the following way: + * + * 1. Compute the "last scheduled occurrence" of the event using the CRON schedule. For example, yesterday at 8am. + * 2. 
Compute the bounds of a time window between the "last scheduled occurrence" (yesterday at 8am) until the "current occurrence" (today at 8am) + * 3. Verify that the target event has occurred within the CRON-interval window. + * 4. If the target event has occurred within the time window, then assertion passes. + * 5. If the target event has not occurred within the time window, then the assertion fails. + * + */ + CRON + /** + * A fixed interval which is used to compute a look-back window for use when evaluating the assertion relative + * to the Evaluation Time of the Assertion. + * + * To compute the valid look-back window, we subtract the fixed interval from the evaluation time. Then, we verify + * that the target event has occurred within that window. + * + * For example, a fixed interval of "24h" would represent a schedule of "in the last 24 hours". + * The 24 hour interval is relative to the evaluation time of the assertion. For example if we schedule the assertion + * to be evaluated each hour, we'd compute the result as follows: + * + * 1. Subtract the fixed interval from the current time (Evaluation time) to compute the bounds of a fixed look-back window. + * 2. Verify that the target event has occurred within the CRON-interval window. + * 3. If the target event has occurred within the time window, then assertion passes. + * 4. If the target event has not occurred within the time window, then the assertion fails. + * + */ + FIXED_INTERVAL + } + + /** + * A cron schedule. This field is required when type is CRON. + */ + cron: optional FreshnessCronSchedule + + /** + * A fixed interval schedule. This field is required when type is FIXED_INTERVAL. + */ + fixedInterval: optional FixedIntervalSchedule + +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessCronSchedule.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessCronSchedule.pdl new file mode 100644 index 00000000000000..d48900690c51df --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessCronSchedule.pdl @@ -0,0 +1,25 @@ +namespace com.linkedin.assertion + +/** +* Attributes defining a CRON-formatted schedule used for defining a freshness assertion. +*/ +record FreshnessCronSchedule { + /** + * A cron-formatted execution interval, as a cron string, e.g. 1 * * * * + */ + cron: string + + /** + * Timezone in which the cron interval applies, e.g. America/Los Angeles + */ + timezone: string + + /** + * An optional offset in milliseconds to SUBTRACT from the timestamp generated by the cron schedule + * to generate the lower bounds of the "freshness window", or the window of time in which an event must have occurred in order for the Freshness check + * to be considering passing. + * + * If left empty, the start of the SLA window will be the _end_ of the previously evaluated Freshness window. + */ + windowStartOffsetMs: optional long +} \ No newline at end of file diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldKind.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldKind.pdl new file mode 100644 index 00000000000000..7b25589e500da6 --- /dev/null +++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldKind.pdl @@ -0,0 +1,17 @@ +namespace com.linkedin.assertion + +enum FreshnessFieldKind { + /** + * Determine that a change has occurred by inspecting an last modified field which + * represents the last time at which a row was changed. 
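+   *
+   * For example (illustrative), an "updated_at" timestamp column: the check passes when the most
+   * recent value of that column falls inside the evaluated freshness window.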
+   */
+  LAST_MODIFIED,
+  /**
+   * Determine that a change has occurred by inspecting a field which should be tracked as the
+   * "high watermark" for the table. This should be an ascending number or date field.
+   *
+   * If no new rows with a greater value of this column have been added since the previous check,
+   * then the Freshness Assertion will fail.
+   */
+  HIGH_WATERMARK
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl
new file mode 100644
index 00000000000000..04acd1c71352de
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/FreshnessFieldSpec.pdl
@@ -0,0 +1,14 @@
+namespace com.linkedin.assertion
+
+import com.linkedin.schema.SchemaFieldSpec
+
+
+/**
+* Lightweight spec used for referencing a particular schema field.
+**/
+record FreshnessFieldSpec includes SchemaFieldSpec {
+  /**
+   * The type of the field being used to verify the Freshness Assertion.
+   */
+  kind: optional FreshnessFieldKind
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentFieldTransformer.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentFieldTransformer.pdl
new file mode 100644
index 00000000000000..d1d3e7b23b666f
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentFieldTransformer.pdl
@@ -0,0 +1,60 @@
+namespace com.linkedin.assertion
+
+/**
+* The definition of the transformer function that should be applied to a given field / column value in a dataset
+* in order to determine the segment or bucket that it belongs to, which in turn is used to evaluate
+* volume assertions.
+*/
+record IncrementingSegmentFieldTransformer {
+  /**
+   * A 'standard' transformer type. Note that not all source systems will support all transformer types.
+   */
+  type: enum IncrementingSegmentFieldTransformerType {
+    /**
+     * Rounds a timestamp (in milliseconds) down to the nearest minute.
+     */
+    TIMESTAMP_MS_TO_MINUTE
+
+    /**
+     * Rounds a timestamp (in milliseconds) down to the nearest hour.
+     */
+    TIMESTAMP_MS_TO_HOUR
+
+    /**
+     * Rounds a timestamp (in milliseconds) down to the start of the day.
+     */
+    TIMESTAMP_MS_TO_DATE
+
+    /**
+     * Rounds a timestamp (in milliseconds) down to the start of the month.
+     */
+    TIMESTAMP_MS_TO_MONTH
+
+    /**
+     * Rounds a timestamp (in milliseconds) down to the start of the year.
+     */
+    TIMESTAMP_MS_TO_YEAR
+
+    /**
+     * Rounds a numeric value down to the nearest integer.
+     */
+    FLOOR
+
+    /**
+     * Rounds a numeric value up to the nearest integer.
+     */
+    CEILING
+
+    /**
+     * A backdoor to provide a native transformer type specific to a given source system like
+     * Snowflake, Redshift, BQ, etc.
+     */
+    NATIVE
+  }
+
+  /**
+   * The 'native' transformer type, useful as a back door if a custom operator is required.
+   * This field is required if the type is NATIVE.
+   */
+  nativeType: optional string
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountChange.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountChange.pdl
new file mode 100644
index 00000000000000..7c4c73f2ea8879
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountChange.pdl
@@ -0,0 +1,33 @@
+namespace com.linkedin.assertion
+
+
+/**
+* Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_CHANGE volume assertion.
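+*
+* For example (illustrative): with a segment derived from a date-partition column, a type of
+* PERCENTAGE, an operator of LESS_THAN_OR_EQUAL_TO and a parameter value of 10, each new segment's
+* row count would be required to differ from the previous segment's by at most 10%.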
+*/
+record IncrementingSegmentRowCountChange {
+  /**
+   * A specification of how the 'segment' can be derived using a column and an optional transformer function.
+   */
+  segment: IncrementingSegmentSpec
+
+  /**
+   * The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage.
+   */
+  type: AssertionValueChangeType
+
+  /**
+   * The operator you'd like to apply to the row count value.
+   *
+   * Note that only numeric operators are valid inputs:
+   * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO,
+   * BETWEEN.
+   */
+  operator: AssertionStdOperator
+
+  /**
+   * The parameters you'd like to provide as input to the operator.
+   *
+   * Note that only numeric parameter types are valid inputs: NUMBER.
+   */
+  parameters: AssertionStdParameters
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountTotal.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountTotal.pdl
new file mode 100644
index 00000000000000..6b035107aae090
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentRowCountTotal.pdl
@@ -0,0 +1,27 @@
+namespace com.linkedin.assertion
+
+/**
+* Attributes defining an INCREMENTING_SEGMENT_ROW_COUNT_TOTAL volume assertion.
+*/
+record IncrementingSegmentRowCountTotal {
+  /**
+   * A specification of how the 'segment' can be derived using a column and an optional transformer function.
+   */
+  segment: IncrementingSegmentSpec
+
+  /**
+   * The operator you'd like to apply.
+   *
+   * Note that only numeric operators are valid inputs:
+   * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO,
+   * BETWEEN.
+   */
+  operator: AssertionStdOperator
+
+  /**
+   * The parameters you'd like to provide as input to the operator.
+   *
+   * Note that only numeric parameter types are valid inputs: NUMBER.
+   */
+  parameters: AssertionStdParameters
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentSpec.pdl
new file mode 100644
index 00000000000000..eddd0c3da3df72
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/IncrementingSegmentSpec.pdl
@@ -0,0 +1,33 @@
+namespace com.linkedin.assertion
+
+import com.linkedin.schema.SchemaFieldSpec
+
+/**
+* Core attributes required to identify an incrementing segment in a table. This type is mainly useful
+* for tables that constantly increase with new rows being added on a particular cadence (e.g. fact or event tables)
+*
+* An incrementing segment represents a logical chunk of data which is INSERTED
+* into a dataset on a regular interval, along with the presence of a constantly-incrementing column
+* value such as an event time, date partition, or last modified column.
+*
+* An incrementing segment is principally identified by 2 key attributes combined:
+*
+* 1. A field or column that represents the incrementing value. New rows that are inserted will be identified using this column.
+* Note that the value of this column may not by itself represent the "bucket" or the "segment" in which the row falls.
+*
+* 2. [Optional] A transformer function that may be applied to the selected column value in order
+* to obtain the final "segment identifier" or "bucket identifier". Rows that have the same value after applying the transformation
+* will be grouped into the same segment, from which the final value (e.g. row count) is determined.
+*/
+record IncrementingSegmentSpec {
+  /**
+   * The field to use to generate segments. It must be constantly incrementing as new rows are inserted.
+   */
+  field: SchemaFieldSpec
+
+  /**
+   * Optional transformer function to apply to the field in order to obtain the final segment or bucket identifier.
+   * If not provided, then no transformation is applied to the field (identity function).
+   */
+  transformer: optional IncrementingSegmentFieldTransformer
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountChange.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountChange.pdl
new file mode 100644
index 00000000000000..85a915066f5845
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountChange.pdl
@@ -0,0 +1,27 @@
+namespace com.linkedin.assertion
+
+/**
+* Attributes defining a ROW_COUNT_CHANGE volume assertion.
+*/
+record RowCountChange {
+  /**
+   * The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage.
+   */
+  type: AssertionValueChangeType
+
+  /**
+   * The operator you'd like to apply.
+   *
+   * Note that only numeric operators are valid inputs:
+   * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO,
+   * BETWEEN.
+   */
+  operator: AssertionStdOperator
+
+  /**
+   * The parameters you'd like to provide as input to the operator.
+   *
+   * Note that only numeric parameter types are valid inputs: NUMBER.
+   */
+  parameters: AssertionStdParameters
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountTotal.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountTotal.pdl
new file mode 100644
index 00000000000000..f691f15f62e042
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/RowCountTotal.pdl
@@ -0,0 +1,22 @@
+namespace com.linkedin.assertion
+
+/**
+* Attributes defining a ROW_COUNT_TOTAL volume assertion.
+*/
+record RowCountTotal {
+  /**
+   * The operator you'd like to apply.
+   *
+   * Note that only numeric operators are valid inputs:
+   * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO,
+   * BETWEEN.
+   */
+  operator: AssertionStdOperator
+
+  /**
+   * The parameters you'd like to provide as input to the operator.
+   *
+   * Note that only numeric parameter types are valid inputs: NUMBER.
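+   *
+   * For example (illustrative): operator BETWEEN with minValue 1000 and maxValue 2000 would
+   * assert that the table contains between 1,000 and 2,000 rows in total.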
+   */
+  parameters: AssertionStdParameters
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl
new file mode 100644
index 00000000000000..fd246e0c7cfc46
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/SchemaAssertionInfo.pdl
@@ -0,0 +1,29 @@
+namespace com.linkedin.assertion
+
+import com.linkedin.common.Urn
+import com.linkedin.schema.SchemaMetadata
+
+/**
+* Attributes that are applicable to schema assertions
+**/
+record SchemaAssertionInfo {
+  /**
+   * The entity targeted by the assertion
+   */
+  @Searchable = {
+    "fieldType": "URN"
+  }
+  @Relationship = {
+    "name": "Asserts",
+    "entityTypes": [ "dataset", "dataJob" ]
+  }
+  entity: Urn
+
+  /**
+   * A definition of the expected structure for the asset
+   *
+   * Note that many of the fields of this model, especially those related to metadata (tags, terms),
+   * will go unused in this context.
+   */
+  schema: SchemaMetadata
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/SqlAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/SqlAssertionInfo.pdl
new file mode 100644
index 00000000000000..f6ce738252f35f
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/SqlAssertionInfo.pdl
@@ -0,0 +1,67 @@
+namespace com.linkedin.assertion
+
+import com.linkedin.common.Urn
+import com.linkedin.dataset.DatasetFilter
+
+/**
+* Attributes defining a SQL Assertion
+*/
+record SqlAssertionInfo {
+  /**
+   * The type of the SQL assertion being monitored.
+   */
+  @Searchable = {}
+  type: enum SqlAssertionType {
+    /**
+     * A SQL Metric Assertion, e.g. one based on a numeric value returned by an arbitrary SQL query.
+     */
+    METRIC
+    /**
+     * A SQL assertion that is evaluated against the CHANGE in a metric
+     * over time.
+     */
+    METRIC_CHANGE
+  }
+
+  /**
+   * The entity targeted by this SQL check.
+   */
+  @Searchable = {
+    "fieldType": "URN"
+  }
+  @Relationship = {
+    "name": "Asserts",
+    "entityTypes": [ "dataset" ]
+  }
+  entity: Urn
+
+  /**
+   * The SQL statement to be executed when evaluating the assertion (or computing the metric).
+   * This should be a valid and complete statement, executable by itself.
+   *
+   * Usually this should be a SELECT query statement.
+   */
+  statement: string
+
+  /**
+   * The type of the value used to evaluate the assertion: a fixed absolute value or a relative percentage.
+   * This value is required if the type is METRIC_CHANGE.
+   */
+  changeType: optional AssertionValueChangeType
+
+  /**
+   * The operator you'd like to apply to the result of the SQL query.
+   *
+   * Note that at this time, only numeric operators are valid inputs:
+   * GREATER_THAN, GREATER_THAN_OR_EQUAL_TO, EQUAL_TO, LESS_THAN, LESS_THAN_OR_EQUAL_TO,
+   * BETWEEN.
+   */
+  operator: AssertionStdOperator
+
+  /**
+   * The parameters you'd like to provide as input to the operator.
+   *
+   * Note that only numeric parameter types are valid inputs: NUMBER.
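+   *
+   * For example (illustrative, table name hypothetical): a METRIC assertion with statement
+   * "SELECT COUNT(*) FROM my_table WHERE status IS NULL", operator EQUAL_TO and a value
+   * parameter of 0 would assert that the query returns no offending rows.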
+   */
+  parameters: AssertionStdParameters
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl
new file mode 100644
index 00000000000000..327b76f95762e3
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/assertion/VolumeAssertionInfo.pdl
@@ -0,0 +1,82 @@
+namespace com.linkedin.assertion
+
+import com.linkedin.common.Urn
+import com.linkedin.dataset.DatasetFilter
+
+/**
+* Attributes defining a dataset Volume Assertion
+*/
+record VolumeAssertionInfo {
+  /**
+   * The type of the volume assertion being monitored.
+   */
+  @Searchable = {}
+  type: enum VolumeAssertionType {
+    /**
+     * A volume assertion that is evaluated against the total row count of a dataset.
+     */
+    ROW_COUNT_TOTAL
+    /**
+     * A volume assertion that is evaluated against an incremental row count of a dataset,
+     * or a row count change.
+     */
+    ROW_COUNT_CHANGE
+    /**
+     * A volume assertion that checks whether the row count of the latest "segment" in a table,
+     * based on an incrementing column, falls into a particular range.
+     *
+     * This can be used to monitor the row count of an incrementing date-partition column segment.
+     */
+    INCREMENTING_SEGMENT_ROW_COUNT_TOTAL
+    /**
+     * A volume assertion that compares the row counts in neighboring "segments" or "partitions"
+     * of an incrementing column.
+     * This can be used to track changes between subsequent date partitions
+     * in a table, for example.
+     */
+    INCREMENTING_SEGMENT_ROW_COUNT_CHANGE
+  }
+
+  /**
+   * The entity targeted by this Volume check.
+   */
+  @Searchable = {
+    "fieldType": "URN"
+  }
+  @Relationship = {
+    "name": "Asserts",
+    "entityTypes": [ "dataset" ]
+  }
+  entity: Urn
+
+  /**
+   * Produce FAILURE Assertion Result if the row count of the asset does not meet specific requirements.
+   * Required if type is 'ROW_COUNT_TOTAL'
+   */
+  rowCountTotal: optional RowCountTotal
+
+  /**
+   * Produce FAILURE Assertion Result if the delta row count of the asset does not meet specific requirements
+   * within a given period of time.
+   * Required if type is 'ROW_COUNT_CHANGE'
+   */
+  rowCountChange: optional RowCountChange
+
+  /**
+   * Produce FAILURE Assertion Result if the asset's latest incrementing segment row count total
+   * does not meet specific requirements. Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_TOTAL'
+   */
+  incrementingSegmentRowCountTotal: optional IncrementingSegmentRowCountTotal
+
+  /**
+   * Produce FAILURE Assertion Result if the asset's incrementing segment row count delta
+   * does not meet specific requirements. Required if type is 'INCREMENTING_SEGMENT_ROW_COUNT_CHANGE'
+   */
+  incrementingSegmentRowCountChange: optional IncrementingSegmentRowCountChange
+
+  /**
+   * A definition of the specific filters that should be applied, when performing monitoring.
+   * If not provided, there is no filter, and the full table is under consideration.
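+   *
+   * For example (illustrative): a DatasetFilter of type SQL whose clause restricts rows to the
+   * current date partition would scope the volume check to that partition only.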
+   */
+  filter: optional DatasetFilter
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractProperties.pdl
new file mode 100644
index 00000000000000..a623f585df30cd
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractProperties.pdl
@@ -0,0 +1,59 @@
+namespace com.linkedin.datacontract
+
+import com.linkedin.common.Urn
+
+/**
+ * Information about a data contract
+ */
+@Aspect = {
+  "name": "dataContractProperties"
+}
+record DataContractProperties {
+  /**
+   * The entity that this contract is associated with. Currently, we only support Dataset contracts, but
+   * in the future we may also support Data Product level contracts.
+   */
+  @Relationship = {
+    "name": "ContractFor",
+    "entityTypes": [ "dataset" ]
+  }
+  entity: Urn
+
+  /**
+   * An optional set of schema contracts. If this is a dataset contract, there will only be one.
+   */
+  @Relationship = {
+    "/*/assertion": {
+      "name": "IncludesSchemaAssertion",
+      "entityTypes": [ "assertion" ]
+    }
+  }
+  schema: optional array[SchemaContract]
+
+  /**
+   * An optional set of Freshness contracts. If this is a dataset contract, there will only be one.
+   */
+  @Relationship = {
+    "/*/assertion": {
+      "name": "IncludesFreshnessAssertion",
+      "entityTypes": [ "assertion" ]
+    }
+  }
+  freshness: optional array[FreshnessContract]
+
+  /**
+   * An optional set of Data Quality contracts, e.g. table and column level contract constraints.
+   */
+  @Relationship = {
+    "/*/assertion": {
+      "name": "IncludesDataQualityAssertion",
+      "entityTypes": [ "assertion" ]
+    }
+  }
+  dataQuality: optional array[DataQualityContract]
+
+  /**
+   * YAML-formatted contract definition
+   */
+  rawContract: optional string
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractStatus.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractStatus.pdl
new file mode 100644
index 00000000000000..d61fb191ae53d2
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataContractStatus.pdl
@@ -0,0 +1,27 @@
+namespace com.linkedin.datacontract
+
+import com.linkedin.common.Urn
+import com.linkedin.common.CustomProperties
+
+/**
+ * Information about the status of a data contract
+ */
+@Aspect = {
+  "name": "dataContractStatus"
+}
+record DataContractStatus includes CustomProperties {
+  /**
+   * The latest state of the data contract
+   */
+  @Searchable = {}
+  state: enum DataContractState {
+    /**
+     * The data contract is active.
+     */
+    ACTIVE
+    /**
+     * The data contract is pending implementation.
+     */
+    PENDING
+  }
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl
new file mode 100644
index 00000000000000..273d2c2a56f95b
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/DataQualityContract.pdl
@@ -0,0 +1,16 @@
+namespace com.linkedin.datacontract
+
+import com.linkedin.common.Urn
+
+
+/**
+ * A data quality contract pertaining to a physical data asset.
+ * Data Quality contracts are used to make assertions about data quality metrics for a physical data asset.
+ */
+record DataQualityContract {
+  /**
+   * The assertion representing the Data Quality contract.
+   * E.g. a table or column-level assertion.
+   */
+  assertion: Urn
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/FreshnessContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/FreshnessContract.pdl
new file mode 100644
index 00000000000000..8cfa66846d505a
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/FreshnessContract.pdl
@@ -0,0 +1,13 @@
+namespace com.linkedin.datacontract
+
+import com.linkedin.common.Urn
+
+/**
+ * A contract pertaining to the operational SLAs of a physical data asset
+ */
+record FreshnessContract {
+  /**
+   * The assertion representing the SLA contract.
+   */
+  assertion: Urn
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl b/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl
new file mode 100644
index 00000000000000..6c11e0da5b1286
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/datacontract/SchemaContract.pdl
@@ -0,0 +1,13 @@
+namespace com.linkedin.datacontract
+
+import com.linkedin.common.Urn
+
+/**
+ * Expectations for a logical schema
+ */
+record SchemaContract {
+  /**
+   * The assertion representing the schema contract.
+   */
+  assertion: Urn
+}
diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetFilter.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetFilter.pdl
new file mode 100644
index 00000000000000..6823398f79f3db
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetFilter.pdl
@@ -0,0 +1,30 @@
+namespace com.linkedin.dataset
+
+/**
+ * A definition of filters that should be used when
+ * querying an external Dataset or Table.
+ *
+ * Note that this model should NOT be used for working with
+ * search / filter on DataHub Platform itself.
+ */
+record DatasetFilter {
+  /**
+   * How the partition will be represented in this model.
+   *
+   * In the future, we'll likely add support for more structured
+   * predicates.
+   */
+  type: enum DatasetFilterType {
+    /**
+     * The partition is represented as an opaque, raw SQL
+     * clause.
+     */
+    SQL
+  }
+
+  /**
+   * The raw where clause string which will be used for monitoring.
+   * Required if the type is SQL.
+   */
+  sql: optional string
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl
index b3e237202fc2f4..f777b5d6e12e7b 100644
--- a/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl
+++ b/metadata-models/src/main/pegasus/com/linkedin/ingestion/DataHubIngestionSourceInfo.pdl
@@ -37,10 +37,10 @@ record DataHubIngestionSourceInfo {
    * Parameters associated with the Ingestion Source
    */
   config: record DataHubIngestionSourceConfig {
-    /**
-    * The JSON recipe to use for ingestion
-    */
-    recipe: string
+      /**
+      * The JSON recipe to use for ingestion
+      */
+      recipe: string
 
     /**
      * The PyPI version of the datahub CLI to use when executing a recipe
@@ -56,5 +56,10 @@ record DataHubIngestionSourceInfo {
      * Whether or not to run this ingestion source in debug mode
      */
     debugMode: optional boolean
+
+    /**
+     * Extra arguments for the ingestion run.
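+     * For example (illustrative): a map such as { "debug_mode": "true" }; the supported keys
+     * depend on the executor that runs the recipe.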
+     */
+    extraArgs: optional map[string, string]
   }
 }
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataContractKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataContractKey.pdl
new file mode 100644
index 00000000000000..f1d4a709cd6bfb
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataContractKey.pdl
@@ -0,0 +1,14 @@
+namespace com.linkedin.metadata.key
+
+/**
+ * Key for a Data Contract
+ */
+@Aspect = {
+  "name": "dataContractKey"
+}
+record DataContractKey {
+  /**
+   * Unique id for the contract
+   */
+  id: string
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaFieldSpec.pdl b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaFieldSpec.pdl
new file mode 100644
index 00000000000000..e875ff7a844036
--- /dev/null
+++ b/metadata-models/src/main/pegasus/com/linkedin/schema/SchemaFieldSpec.pdl
@@ -0,0 +1,21 @@
+namespace com.linkedin.schema
+
+/**
+* Lightweight spec used for referencing a particular schema field.
+**/
+record SchemaFieldSpec {
+  /**
+   * The field path
+   */
+  path: string
+
+  /**
+   * The DataHub standard schema field type.
+   */
+  type: string
+
+  /**
+   * The native field type
+   */
+  nativeType: string
+}
\ No newline at end of file
diff --git a/metadata-models/src/main/resources/entity-registry.yml b/metadata-models/src/main/resources/entity-registry.yml
index 56fc5f6568eb7e..11d0f74305d7be 100644
--- a/metadata-models/src/main/resources/entity-registry.yml
+++ b/metadata-models/src/main/resources/entity-registry.yml
@@ -262,6 +262,7 @@ entities:
       - assertionInfo
       - dataPlatformInstance
       - assertionRunEvent
+      - assertionActions
       - status
   - name: dataHubRetention
     category: internal
@@ -457,4 +458,12 @@ entities:
     aspects:
       - ownershipTypeInfo
       - status
+  - name: dataContract
+    category: core
+    keyAspect: dataContractKey
+    aspects:
+      - dataContractProperties
+      - dataContractStatus
+      - status
+
 events:
diff --git a/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java b/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java
index f9cf1b01e17626..d3c5ba822ac041 100644
--- a/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java
+++ b/metadata-service/auth-config/src/main/java/com/datahub/authentication/AuthenticationConfiguration.java
@@ -29,4 +29,6 @@ public class AuthenticationConfiguration {
    * The lifespan of a UI session token.
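+   * For example, 86400000 ms corresponds to a 24-hour session.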
    */
   private long sessionTokenDurationMs;
+
+  private TokenServiceConfiguration tokenService;
 }
diff --git a/metadata-service/auth-config/src/main/java/com/datahub/authentication/TokenServiceConfiguration.java b/metadata-service/auth-config/src/main/java/com/datahub/authentication/TokenServiceConfiguration.java
new file mode 100644
index 00000000000000..0a606f0f06d920
--- /dev/null
+++ b/metadata-service/auth-config/src/main/java/com/datahub/authentication/TokenServiceConfiguration.java
@@ -0,0 +1,15 @@
+package com.datahub.authentication;
+
+import lombok.Data;
+
+
+@Data
+/**
+ * Configurations for DataHub token service
+ */
+public class TokenServiceConfiguration {
+  private String signingKey;
+  private String salt;
+  private String issuer;
+  private String signingAlgorithm;
+}
diff --git a/metadata-service/auth-filter/build.gradle b/metadata-service/auth-filter/build.gradle
index 2dd07ef10274c5..61e9015adc9423 100644
--- a/metadata-service/auth-filter/build.gradle
+++ b/metadata-service/auth-filter/build.gradle
@@ -14,4 +14,6 @@ dependencies {
   annotationProcessor externalDependency.lombok
 
   testImplementation externalDependency.mockito
+  testImplementation externalDependency.testng
+  testImplementation externalDependency.springBootTest
 }
\ No newline at end of file
diff --git a/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthTestConfiguration.java b/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthTestConfiguration.java
new file mode 100644
index 00000000000000..05ca428283a6ce
--- /dev/null
+++ b/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthTestConfiguration.java
@@ -0,0 +1,79 @@
+package com.datahub.auth.authentication;
+
+import com.datahub.auth.authentication.filter.AuthenticationFilter;
+import com.datahub.authentication.AuthenticationConfiguration;
+import com.datahub.authentication.AuthenticatorConfiguration;
+import com.datahub.authentication.TokenServiceConfiguration;
+import com.datahub.authentication.token.StatefulTokenService;
+import com.linkedin.gms.factory.config.ConfigurationProvider;
+import com.linkedin.metadata.config.AuthPluginConfiguration;
+import com.linkedin.metadata.config.DataHubConfiguration;
+import com.linkedin.metadata.config.PluginConfiguration;
+import com.linkedin.metadata.entity.EntityService;
+import java.util.List;
+import java.util.Map;
+import javax.servlet.ServletException;
+import org.springframework.context.annotation.Bean;
+import org.springframework.context.annotation.Configuration;
+import org.springframework.context.annotation.DependsOn;
+
+import static org.mockito.Mockito.*;
+
+@Configuration
+public class AuthTestConfiguration {
+
+
+  @Bean
+  public EntityService entityService() {
+    return mock(EntityService.class);
+  }
+
+  @Bean("dataHubTokenService")
+  public StatefulTokenService statefulTokenService(ConfigurationProvider configurationProvider, EntityService entityService) {
+    TokenServiceConfiguration tokenServiceConfiguration = configurationProvider.getAuthentication().getTokenService();
+    return new StatefulTokenService(
+        tokenServiceConfiguration.getSigningKey(),
+        tokenServiceConfiguration.getSigningAlgorithm(),
+        tokenServiceConfiguration.getIssuer(),
+        entityService,
+        tokenServiceConfiguration.getSalt()
+    );
+  }
+
+  @Bean
+  public ConfigurationProvider configurationProvider() {
+    ConfigurationProvider configurationProvider = new ConfigurationProvider();
+    AuthenticationConfiguration authenticationConfiguration = new AuthenticationConfiguration();
+    authenticationConfiguration.setEnabled(true);
+    configurationProvider.setAuthentication(authenticationConfiguration);
+    DataHubConfiguration dataHubConfiguration = new DataHubConfiguration();
+    PluginConfiguration pluginConfiguration = new PluginConfiguration();
+    AuthPluginConfiguration authPluginConfiguration = new AuthPluginConfiguration();
+    authenticationConfiguration.setSystemClientId("__datahub_system");
+    authenticationConfiguration.setSystemClientSecret("JohnSnowKnowsNothing");
+    TokenServiceConfiguration tokenServiceConfiguration = new TokenServiceConfiguration();
+    tokenServiceConfiguration.setIssuer("datahub-metadata-service");
+    tokenServiceConfiguration.setSigningKey("WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94=");
+    tokenServiceConfiguration.setSalt("ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O=");
+    tokenServiceConfiguration.setSigningAlgorithm("HS256");
+    authenticationConfiguration.setTokenService(tokenServiceConfiguration);
+    AuthenticatorConfiguration authenticator = new AuthenticatorConfiguration();
+    authenticator.setType("com.datahub.authentication.authenticator.DataHubTokenAuthenticator");
+    authenticator.setConfigs(Map.of("signingKey", "WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94=",
+        "salt", "ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O="));
+    List<AuthenticatorConfiguration> authenticators = List.of(authenticator);
+    authenticationConfiguration.setAuthenticators(authenticators);
+    authPluginConfiguration.setPath("");
+    pluginConfiguration.setAuth(authPluginConfiguration);
+    dataHubConfiguration.setPlugin(pluginConfiguration);
+    configurationProvider.setDatahub(dataHubConfiguration);
+    return configurationProvider;
+  }
+
+  @Bean
+  // TODO: Constructor injection
+  @DependsOn({"configurationProvider", "dataHubTokenService", "entityService"})
+  public AuthenticationFilter authenticationFilter() throws ServletException {
+    return new AuthenticationFilter();
+  }
+}
diff --git a/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthenticationFilterTest.java b/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthenticationFilterTest.java
new file mode 100644
index 00000000000000..2ac65bf09c912e
--- /dev/null
+++ b/metadata-service/auth-filter/src/test/java/com/datahub/auth/authentication/AuthenticationFilterTest.java
@@ -0,0 +1,53 @@
+package com.datahub.auth.authentication;
+
+import com.datahub.auth.authentication.filter.AuthenticationFilter;
+import com.datahub.authentication.Actor;
+import com.datahub.authentication.ActorType;
+import com.datahub.authentication.token.StatefulTokenService;
+import com.datahub.authentication.token.TokenException;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+import javax.servlet.FilterChain;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.testng.AbstractTestNGSpringContextTests;
+import org.testng.annotations.Test;
+
+import static com.datahub.authentication.AuthenticationConstants.*;
+import static org.mockito.Mockito.*;
+
+
+@ContextConfiguration(classes = { AuthTestConfiguration.class })
+public class AuthenticationFilterTest extends AbstractTestNGSpringContextTests {
+
+  @Autowired
+  AuthenticationFilter _authenticationFilter;
+
+  @Autowired
+  StatefulTokenService _statefulTokenService;
+
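+  // Expects a request carrying an expired bearer token to be rejected with 401 Unauthorized.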
+  @Test
+  public void testExpiredToken() throws ServletException, IOException, TokenException {
+    _authenticationFilter.init(null);
+    HttpServletRequest servletRequest = mock(HttpServletRequest.class);
+    HttpServletResponse servletResponse = mock(HttpServletResponse.class);
+    FilterChain filterChain = mock(FilterChain.class);
+    Actor actor = new Actor(ActorType.USER, "datahub");
+//    String token = _statefulTokenService.generateAccessToken(TokenType.SESSION, actor, 0L, System.currentTimeMillis(), "token",
+//        "token", actor.toUrnStr());
+    // Token generated 9/11/23, invalid for all future dates
+    String token = "eyJhbGciOiJIUzI1NiJ9.eyJhY3RvclR5cGUiOiJVU0VSIZCI6ImRhdGFodWIiLCJ0eXBlIjoiU0VTU0lPTiIsInZlcnNpb24iOiIxIiwian"
+        + "RpIjoiMmI0MzZkZDAtYjEwOS00N2UwLWJmYTEtMzM2ZmU4MTU4MDE1Iiwic3ViIjoiZGF0YWh1YiIsImV4cCI6MTY5NDU0NzA2OCwiaXNzIjoiZGF"
+        + "0YWh1Yi1tZXRhZGF0YS1zZXJ2aWNlIn0.giqx7J5a9mxuubG6rXdAMoaGlcII-fqY-W82Wm7OlLI";
+    when(servletRequest.getHeaderNames()).thenReturn(Collections.enumeration(List.of(AUTHORIZATION_HEADER_NAME)));
+    when(servletRequest.getHeader(AUTHORIZATION_HEADER_NAME))
+        .thenReturn("Bearer " + token);
+
+    _authenticationFilter.doFilter(servletRequest, servletResponse, filterChain);
+    verify(servletResponse, times(1)).sendError(eq(HttpServletResponse.SC_UNAUTHORIZED), anyString());
+  }
+}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java
index d62c37160f8165..f8eca541e1efb4 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/AuthorizerChain.java
@@ -82,7 +82,7 @@ public AuthorizationResult authorize(@Nonnull final AuthorizationRequest request) {
   }
 
   @Override
-  public AuthorizedActors authorizedActors(String privilege, Optional<ResourceSpec> resourceSpec) {
+  public AuthorizedActors authorizedActors(String privilege, Optional<EntitySpec> resourceSpec) {
     if (this.authorizers.isEmpty()) {
       return null;
     }
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
index f653ccf72cf545..e30fb93109915a 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DataHubAuthorizer.java
@@ -8,6 +8,8 @@
 import com.linkedin.entity.client.EntityClient;
 import com.linkedin.metadata.authorization.PoliciesConfig;
 import com.linkedin.policy.DataHubPolicyInfo;
+
+import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -17,6 +19,8 @@
 import java.util.concurrent.Executors;
 import java.util.concurrent.ScheduledExecutorService;
 import java.util.concurrent.TimeUnit;
+import java.util.concurrent.locks.ReadWriteLock;
+import java.util.concurrent.locks.ReentrantReadWriteLock;
 import javax.annotation.Nonnull;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
@@ -51,11 +55,12 @@ public enum AuthorizationMode {
   // Maps privilege name to the associated set of policies for fast access.
   // Not concurrent data structure because writes are always against the entire thing.
   private final Map<String, List<DataHubPolicyInfo>> _policyCache = new HashMap<>(); // Shared Policy Cache.
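+  // Guards the shared policy cache: authorization reads take the read lock; the background
+  // refresh runnable takes the write lock when swapping in a new cache.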
+  private final ReadWriteLock _lockPolicyCache = new ReentrantReadWriteLock();
 
   private final ScheduledExecutorService _refreshExecutorService = Executors.newScheduledThreadPool(1);
   private final PolicyRefreshRunnable _policyRefreshRunnable;
   private final PolicyEngine _policyEngine;
-  private ResourceSpecResolver _resourceSpecResolver;
+  private EntitySpecResolver _entitySpecResolver;
   private AuthorizationMode _mode;
 
   public static final String ALL = "ALL";
@@ -69,14 +74,14 @@ public DataHubAuthorizer(
     _systemAuthentication = Objects.requireNonNull(systemAuthentication);
     _mode = Objects.requireNonNull(mode);
     _policyEngine = new PolicyEngine(systemAuthentication, Objects.requireNonNull(entityClient));
-    _policyRefreshRunnable = new PolicyRefreshRunnable(systemAuthentication, new PolicyFetcher(entityClient), _policyCache);
+    _policyRefreshRunnable = new PolicyRefreshRunnable(systemAuthentication, new PolicyFetcher(entityClient), _policyCache, _lockPolicyCache);
     _refreshExecutorService.scheduleAtFixedRate(_policyRefreshRunnable, delayIntervalSeconds, refreshIntervalSeconds, TimeUnit.SECONDS);
   }
 
   @Override
   public void init(@Nonnull Map<String, Object> authorizerConfig, @Nonnull AuthorizerContext ctx) {
     // Pass. No static config.
-    _resourceSpecResolver = Objects.requireNonNull(ctx.getResourceSpecResolver());
+    _entitySpecResolver = Objects.requireNonNull(ctx.getEntitySpecResolver());
   }
 
   public AuthorizationResult authorize(@Nonnull final AuthorizationRequest request) {
@@ -86,30 +91,43 @@ public AuthorizationResult authorize(@Nonnull final AuthorizationRequest request) {
       return new AuthorizationResult(request, AuthorizationResult.Type.ALLOW, null);
     }
 
-    Optional<ResolvedResourceSpec> resolvedResourceSpec = request.getResourceSpec().map(_resourceSpecResolver::resolve);
+    Optional<ResolvedEntitySpec> resolvedResourceSpec = request.getResourceSpec().map(_entitySpecResolver::resolve);
 
-    // 1. Fetch the policies relevant to the requested privilege.
-    final List<DataHubPolicyInfo> policiesToEvaluate = _policyCache.getOrDefault(request.getPrivilege(), new ArrayList<>());
+    _lockPolicyCache.readLock().lock();
+    try {
+      // 1. Fetch the policies relevant to the requested privilege.
+      final List<DataHubPolicyInfo> policiesToEvaluate = _policyCache.getOrDefault(request.getPrivilege(), new ArrayList<>());
 
-    // 2. Evaluate each policy.
-    for (DataHubPolicyInfo policy : policiesToEvaluate) {
-      if (isRequestGranted(policy, request, resolvedResourceSpec)) {
-        // Short circuit if policy has granted privileges to this actor.
-        return new AuthorizationResult(request, AuthorizationResult.Type.ALLOW,
-            String.format("Granted by policy with type: %s", policy.getType()));
+      // 2. Evaluate each policy.
+      for (DataHubPolicyInfo policy : policiesToEvaluate) {
+        if (isRequestGranted(policy, request, resolvedResourceSpec)) {
+          // Short circuit if policy has granted privileges to this actor.
+          return new AuthorizationResult(request, AuthorizationResult.Type.ALLOW,
+              String.format("Granted by policy with type: %s", policy.getType()));
+        }
       }
+      return new AuthorizationResult(request, AuthorizationResult.Type.DENY, null);
+    } finally {
+      _lockPolicyCache.readLock().unlock();
     }
-    return new AuthorizationResult(request, AuthorizationResult.Type.DENY, null);
   }
 
-  public List<String> getGrantedPrivileges(final String actorUrn, final Optional<ResourceSpec> resourceSpec) {
+  public List<String> getGrantedPrivileges(final String actor, final Optional<EntitySpec> resourceSpec) {
+
+    _lockPolicyCache.readLock().lock();
+    try {
+      // 1. Fetch all policies
+      final List<DataHubPolicyInfo> policiesToEvaluate = _policyCache.getOrDefault(ALL, new ArrayList<>());
 
-    // 1. Fetch all policies
-    final List<DataHubPolicyInfo> policiesToEvaluate = _policyCache.getOrDefault(ALL, new ArrayList<>());
+      Urn actorUrn = UrnUtils.getUrn(actor);
+      final ResolvedEntitySpec resolvedActorSpec = _entitySpecResolver.resolve(new EntitySpec(actorUrn.getEntityType(), actor));
 
-    Optional<ResolvedResourceSpec> resolvedResourceSpec = resourceSpec.map(_resourceSpecResolver::resolve);
+      Optional<ResolvedEntitySpec> resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve);
 
-    return _policyEngine.getGrantedPrivileges(policiesToEvaluate, UrnUtils.getUrn(actorUrn), resolvedResourceSpec);
+      return _policyEngine.getGrantedPrivileges(policiesToEvaluate, resolvedActorSpec, resolvedResourceSpec);
+    } finally {
+      _lockPolicyCache.readLock().unlock();
+    }
   }
 
   /**
@@ -118,37 +136,43 @@ public List<String> getGrantedPrivileges(final String actorUrn, final Optional<ResourceSpec> resourceSpec) {
-    // Step 1: Find policies granting the privilege.
-    final List<DataHubPolicyInfo> policiesToEvaluate = _policyCache.getOrDefault(privilege, new ArrayList<>());
-
-    Optional<ResolvedResourceSpec> resolvedResourceSpec = resourceSpec.map(_resourceSpecResolver::resolve);
+      final Optional<EntitySpec> resourceSpec) {
 
     final List<Urn> authorizedUsers = new ArrayList<>();
     final List<Urn> authorizedGroups = new ArrayList<>();
     boolean allUsers = false;
     boolean allGroups = false;
 
-    // Step 2: For each policy, determine whether the resource is a match.
-    for (DataHubPolicyInfo policy : policiesToEvaluate) {
-      if (!PoliciesConfig.ACTIVE_POLICY_STATE.equals(policy.getState())) {
-        // Policy is not active, skip.
-        continue;
-      }
+    _lockPolicyCache.readLock().lock();
+    try {
+      // Step 1: Find policies granting the privilege.
+      final List<DataHubPolicyInfo> policiesToEvaluate = _policyCache.getOrDefault(privilege, new ArrayList<>());
 
-      final PolicyEngine.PolicyActors matchingActors = _policyEngine.getMatchingActors(policy, resolvedResourceSpec);
+      Optional<ResolvedEntitySpec> resolvedResourceSpec = resourceSpec.map(_entitySpecResolver::resolve);
 
-      // Step 3: For each matching policy, add actors that are authorized.
-      authorizedUsers.addAll(matchingActors.getUsers());
-      authorizedGroups.addAll(matchingActors.getGroups());
-      if (matchingActors.allUsers()) {
-        allUsers = true;
-      }
-      if (matchingActors.allGroups()) {
-        allGroups = true;
+
+      // Step 2: For each policy, determine whether the resource is a match.
+      for (DataHubPolicyInfo policy : policiesToEvaluate) {
+        if (!PoliciesConfig.ACTIVE_POLICY_STATE.equals(policy.getState())) {
+          // Policy is not active, skip.
+          continue;
+        }
+
+        final PolicyEngine.PolicyActors matchingActors = _policyEngine.getMatchingActors(policy, resolvedResourceSpec);
+
+        // Step 3: For each matching policy, add actors that are authorized.
+        authorizedUsers.addAll(matchingActors.getUsers());
+        authorizedGroups.addAll(matchingActors.getGroups());
+        if (matchingActors.allUsers()) {
+          allUsers = true;
+        }
+        if (matchingActors.allGroups()) {
+          allGroups = true;
+        }
       }
+    } finally {
+      _lockPolicyCache.readLock().unlock();
     }
 
-    // Step 4: Return all authorized users and groups.
     return new AuthorizedActors(privilege, authorizedUsers, authorizedGroups, allUsers, allGroups);
   }
 
@@ -180,19 +204,36 @@ private boolean isSystemRequest(final AuthorizationRequest request, final Authentication systemAuthentication) {
 
   /**
    * Returns true if a policy grants the requested privilege for a given actor and resource.
    */
-  private boolean isRequestGranted(final DataHubPolicyInfo policy, final AuthorizationRequest request, final Optional<ResolvedResourceSpec> resourceSpec) {
+  private boolean isRequestGranted(final DataHubPolicyInfo policy, final AuthorizationRequest request, final Optional<ResolvedEntitySpec> resourceSpec) {
     if (AuthorizationMode.ALLOW_ALL.equals(mode())) {
       return true;
     }
+
+    Optional<Urn> actorUrn = getUrnFromRequestActor(request.getActorUrn());
+    if (actorUrn.isEmpty()) {
+      return false;
+    }
+
+    final ResolvedEntitySpec resolvedActorSpec = _entitySpecResolver.resolve(
+        new EntitySpec(actorUrn.get().getEntityType(), request.getActorUrn()));
     final PolicyEngine.PolicyEvaluationResult result = _policyEngine.evaluatePolicy(
         policy,
-        request.getActorUrn(),
+        resolvedActorSpec,
         request.getPrivilege(),
         resourceSpec
     );
     return result.isGranted();
   }
 
+  private Optional<Urn> getUrnFromRequestActor(String actor) {
+    try {
+      return Optional.of(Urn.createFromString(actor));
+    } catch (URISyntaxException e) {
+      log.error(String.format("Failed to bind actor %s to an URN. Actors must be URNs. Denying the authorization request", actor));
+      return Optional.empty();
+    }
+  }
+
   /**
    * A {@link Runnable} used to periodically fetch a new instance of the policies Cache.
    *
@@ -206,6 +247,7 @@ static class PolicyRefreshRunnable implements Runnable {
     private final Authentication _systemAuthentication;
     private final PolicyFetcher _policyFetcher;
     private final Map<String, List<DataHubPolicyInfo>> _policyCache;
+    private final ReadWriteLock _lockPolicyCache;
 
     @Override
     public void run() {
@@ -231,10 +273,13 @@ public void run() {
               "Failed to retrieve policy urns! Skipping updating policy cache until next refresh. start: {}, count: {}", start, count, e);
           return;
         }
-        synchronized (_policyCache) {
-          _policyCache.clear();
-          _policyCache.putAll(newCache);
-        }
+      }
+      _lockPolicyCache.writeLock().lock();
+      try {
+        _policyCache.clear();
+        _policyCache.putAll(newCache);
+      } finally {
+        _lockPolicyCache.writeLock().unlock();
       }
       log.debug(String.format("Successfully fetched %s policies.", total));
     } catch (Exception e) {
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java
new file mode 100644
index 00000000000000..65b0329a9c4f25
--- /dev/null
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultEntitySpecResolver.java
@@ -0,0 +1,41 @@
+package com.datahub.authorization;
+
+import com.datahub.authentication.Authentication;
+import com.datahub.authorization.fieldresolverprovider.DataPlatformInstanceFieldResolverProvider;
+import com.datahub.authorization.fieldresolverprovider.DomainFieldResolverProvider;
+import com.datahub.authorization.fieldresolverprovider.EntityFieldResolverProvider;
+import com.datahub.authorization.fieldresolverprovider.EntityTypeFieldResolverProvider;
+import com.datahub.authorization.fieldresolverprovider.EntityUrnFieldResolverProvider;
+import com.datahub.authorization.fieldresolverprovider.GroupMembershipFieldResolverProvider;
+import com.datahub.authorization.fieldresolverprovider.OwnerFieldResolverProvider;
+import com.google.common.collect.ImmutableList;
+import com.linkedin.entity.client.EntityClient;
+import com.linkedin.util.Pair;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+
+public class DefaultEntitySpecResolver implements EntitySpecResolver {
+  private final List<EntityFieldResolverProvider> _entityFieldResolverProviders;
+
+  public DefaultEntitySpecResolver(Authentication systemAuthentication, EntityClient entityClient) {
+    _entityFieldResolverProviders =
+        ImmutableList.of(new EntityTypeFieldResolverProvider(), new EntityUrnFieldResolverProvider(),
+            new DomainFieldResolverProvider(entityClient, systemAuthentication),
+            new OwnerFieldResolverProvider(entityClient, systemAuthentication),
+            new DataPlatformInstanceFieldResolverProvider(entityClient, systemAuthentication),
+            new GroupMembershipFieldResolverProvider(entityClient, systemAuthentication));
+  }
+
+  @Override
+  public ResolvedEntitySpec resolve(EntitySpec entitySpec) {
+    return new ResolvedEntitySpec(entitySpec, getFieldResolvers(entitySpec));
+  }
+
+  private Map<EntityFieldType, FieldResolver> getFieldResolvers(EntitySpec entitySpec) {
+    return _entityFieldResolverProviders.stream()
+        .flatMap(resolver -> resolver.getFieldTypes().stream().map(fieldType -> Pair.of(fieldType, resolver)))
+        .collect(Collectors.toMap(Pair::getKey, pair -> pair.getValue().getFieldResolver(entitySpec)));
+  }
+}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultResourceSpecResolver.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultResourceSpecResolver.java
deleted file mode 100644
index cd4e0b09678296..00000000000000
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/DefaultResourceSpecResolver.java
+++ /dev/null
@@ -1,36 +0,0 @@
-package com.datahub.authorization;
-
-import com.datahub.authorization.fieldresolverprovider.EntityTypeFieldResolverProvider;
-import com.datahub.authorization.fieldresolverprovider.OwnerFieldResolverProvider;
-import com.datahub.authentication.Authentication;
-import com.datahub.authorization.fieldresolverprovider.DomainFieldResolverProvider;
-import com.datahub.authorization.fieldresolverprovider.EntityUrnFieldResolverProvider;
-import com.datahub.authorization.fieldresolverprovider.ResourceFieldResolverProvider;
-import com.google.common.collect.ImmutableList;
-import com.linkedin.entity.client.EntityClient;
-import java.util.List;
-import java.util.Map;
-import java.util.stream.Collectors;
-
-
-public class DefaultResourceSpecResolver implements ResourceSpecResolver {
-  private final List<ResourceFieldResolverProvider> _resourceFieldResolverProviders;
-
-  public DefaultResourceSpecResolver(Authentication systemAuthentication, EntityClient entityClient) {
-    _resourceFieldResolverProviders =
-        ImmutableList.of(new EntityTypeFieldResolverProvider(), new EntityUrnFieldResolverProvider(),
-            new DomainFieldResolverProvider(entityClient, systemAuthentication),
-            new OwnerFieldResolverProvider(entityClient, systemAuthentication));
-  }
-
-  @Override
-  public ResolvedResourceSpec resolve(ResourceSpec resourceSpec) {
-    return new ResolvedResourceSpec(resourceSpec, getFieldResolvers(resourceSpec));
-  }
-
-  private Map<ResourceFieldType, FieldResolver> getFieldResolvers(ResourceSpec resourceSpec) {
-    return _resourceFieldResolverProviders.stream()
-        .collect(Collectors.toMap(ResourceFieldResolverProvider::getFieldType,
-            hydrator -> hydrator.getFieldResolver(resourceSpec)));
-  }
-}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/FilterUtils.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/FilterUtils.java
index 76ed18e2baf783..0dbb9cd132f8a8 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/FilterUtils.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/FilterUtils.java
@@ -26,7 +26,7 @@ private FilterUtils() {
    * Creates new PolicyMatchCriterion with field and value, using EQUAL PolicyMatchCondition.
    */
   @Nonnull
-  public static PolicyMatchCriterion newCriterion(@Nonnull ResourceFieldType field, @Nonnull List<String> values) {
+  public static PolicyMatchCriterion newCriterion(@Nonnull EntityFieldType field, @Nonnull List<String> values) {
     return newCriterion(field, values, PolicyMatchCondition.EQUALS);
   }
 
@@ -34,7 +34,7 @@ public static PolicyMatchCriterion newCriterion(@Nonnull ResourceFieldType field,
    * Creates new PolicyMatchCriterion with field, value and PolicyMatchCondition.
    */
   @Nonnull
-  public static PolicyMatchCriterion newCriterion(@Nonnull ResourceFieldType field, @Nonnull List<String> values,
+  public static PolicyMatchCriterion newCriterion(@Nonnull EntityFieldType field, @Nonnull List<String> values,
       @Nonnull PolicyMatchCondition policyMatchCondition) {
     return new PolicyMatchCriterion().setField(field.name())
         .setValues(new StringArray(values))
@@ -45,7 +45,7 @@ public static PolicyMatchCriterion newCriterion(@Nonnull ResourceFieldType field,
    * Creates new PolicyMatchFilter from a map of Criteria by removing null-valued Criteria and using EQUAL PolicyMatchCondition (default).
    */
   @Nonnull
-  public static PolicyMatchFilter newFilter(@Nullable Map<ResourceFieldType, List<String>> params) {
+  public static PolicyMatchFilter newFilter(@Nullable Map<EntityFieldType, List<String>> params) {
     if (params == null) {
       return EMPTY_FILTER;
     }
@@ -61,7 +61,7 @@ public static PolicyMatchFilter newFilter(@Nullable Map<ResourceFieldType, List<String>> params) {
    */
   @Nonnull
-  public static PolicyMatchFilter newFilter(@Nonnull ResourceFieldType field, @Nonnull List<String> values) {
+  public static PolicyMatchFilter newFilter(@Nonnull EntityFieldType field, @Nonnull List<String> values) {
     return newFilter(Collections.singletonMap(field, values));
   }
 }
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java
index 6a36fac7de4e00..f8c017ea74e1f6 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/PolicyEngine.java
@@ -1,7 +1,6 @@
 package com.datahub.authorization;
 
 import com.datahub.authentication.Authentication;
-import com.google.common.collect.ImmutableSet;
 import com.linkedin.common.Owner;
 import com.linkedin.common.Ownership;
 import com.linkedin.common.urn.Urn;
@@ -11,8 +10,6 @@
 import com.linkedin.entity.EnvelopedAspect;
 import com.linkedin.entity.EnvelopedAspectMap;
 import com.linkedin.entity.client.EntityClient;
-import com.linkedin.identity.GroupMembership;
-import com.linkedin.identity.NativeGroupMembership;
 import com.linkedin.identity.RoleMembership;
 import com.linkedin.metadata.Constants;
 import com.linkedin.metadata.authorization.PoliciesConfig;
@@ -23,7 +20,7 @@
 import com.linkedin.policy.PolicyMatchCriterion;
 import com.linkedin.policy.PolicyMatchCriterionArray;
 import com.linkedin.policy.PolicyMatchFilter;
-import java.net.URISyntaxException;
+
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashSet;
@@ -34,6 +31,7 @@
 import java.util.stream.Collectors;
 import java.util.stream.Stream;
 import javax.annotation.Nullable;
+
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;
@@ -49,37 +47,22 @@ public class PolicyEngine {
 
   public PolicyEvaluationResult evaluatePolicy(
       final DataHubPolicyInfo policy,
-      final String actorStr,
+      final ResolvedEntitySpec resolvedActorSpec,
       final String privilege,
-      final Optional<ResolvedResourceSpec> resource) {
-    try {
-      // Currently Actor must be an urn. Consider whether this contract should be pushed up.
-      final Urn actor = Urn.createFromString(actorStr);
-      return evaluatePolicy(policy, actor, privilege, resource);
-    } catch (URISyntaxException e) {
-      log.error(String.format("Failed to bind actor %s to an URN. Actors must be URNs. Denying the authorization request", actorStr));
-      return PolicyEvaluationResult.DENIED;
-    }
-  }
-
-  public PolicyEvaluationResult evaluatePolicy(
-      final DataHubPolicyInfo policy,
-      final Urn actor,
-      final String privilege,
-      final Optional<ResolvedResourceSpec> resource) {
+      final Optional<ResolvedEntitySpec> resource) {
 
     final PolicyEvaluationContext context = new PolicyEvaluationContext();
     log.debug("Evaluating policy {}", policy.getDisplayName());
 
     // If the privilege is not in scope, deny the request.
-    if (!isPrivilegeMatch(privilege, policy.getPrivileges(), context)) {
+    if (!isPrivilegeMatch(privilege, policy.getPrivileges())) {
       log.debug("Policy denied based on irrelevant privileges {} for {}", policy.getPrivileges(), privilege);
       return PolicyEvaluationResult.DENIED;
     }
 
     // If policy is not applicable, deny the request
-    if (!isPolicyApplicable(policy, actor, resource, context)) {
-      log.debug("Policy does not applicable for actor {} and resource {}", actor, resource);
+    if (!isPolicyApplicable(policy, resolvedActorSpec, resource, context)) {
+      log.debug("Policy is not applicable for actor {} and resource {}", resolvedActorSpec.getSpec().getEntity(), resource);
       return PolicyEvaluationResult.DENIED;
     }
 
@@ -89,7 +72,7 @@ public PolicyEvaluationResult evaluatePolicy(
 
   public PolicyActors getMatchingActors(
       final DataHubPolicyInfo policy,
-      final Optional<ResolvedResourceSpec> resource) {
+      final Optional<ResolvedEntitySpec> resource) {
     final List<Urn> users = new ArrayList<>();
     final List<Urn> groups = new ArrayList<>();
     boolean allUsers = false;
@@ -126,8 +109,8 @@ public PolicyActors getMatchingActors(
 
   private boolean isPolicyApplicable(
       final DataHubPolicyInfo policy,
-      final Urn actor,
-      final Optional<ResolvedResourceSpec> resource,
+      final ResolvedEntitySpec resolvedActorSpec,
+      final Optional<ResolvedEntitySpec> resource,
       final PolicyEvaluationContext context
   ) {
 
@@ -137,25 +120,21 @@ private boolean isPolicyApplicable(
     }
 
     // If the resource is not in scope, deny the request.
-    if (!isResourceMatch(policy.getType(), policy.getResources(), resource, context)) {
+    if (!isResourceMatch(policy.getType(), policy.getResources(), resource)) {
       return false;
     }
 
     // If the actor does not match, deny the request.
-    if (!isActorMatch(actor, policy.getActors(), resource, context)) {
-      return false;
-    }
-
-    return true;
+    return isActorMatch(resolvedActorSpec, policy.getActors(), resource, context);
   }
 
   public List<String> getGrantedPrivileges(
       final List<DataHubPolicyInfo> policies,
-      final Urn actor,
-      final Optional<ResolvedResourceSpec> resource) {
+      final ResolvedEntitySpec resolvedActorSpec,
+      final Optional<ResolvedEntitySpec> resource) {
     PolicyEvaluationContext context = new PolicyEvaluationContext();
     return policies.stream()
-        .filter(policy -> isPolicyApplicable(policy, actor, resource, context))
+        .filter(policy -> isPolicyApplicable(policy, resolvedActorSpec, resource, context))
         .flatMap(policy -> policy.getPrivileges().stream())
         .distinct()
         .collect(Collectors.toList());
   }
 
@@ -168,9 +147,8 @@ public List<String> getGrantedPrivileges(
    * If the policy is of type "METADATA", the resourceSpec parameter will be matched against the
    * resource filter defined on the policy.
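+   * For example, a METADATA policy whose resource filter specifies type "dataset" will match
+   * only resource specs belonging to dataset entities.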
*/ - public Boolean policyMatchesResource(final DataHubPolicyInfo policy, final Optional resourceSpec) { - return isResourceMatch(policy.getType(), policy.getResources(), resourceSpec, - new PolicyEvaluationContext()); + public Boolean policyMatchesResource(final DataHubPolicyInfo policy, final Optional resourceSpec) { + return isResourceMatch(policy.getType(), policy.getResources(), resourceSpec); } /** @@ -178,8 +156,7 @@ public Boolean policyMatchesResource(final DataHubPolicyInfo policy, final Optio */ private boolean isPrivilegeMatch( final String requestPrivilege, - final List policyPrivileges, - final PolicyEvaluationContext context) { + final List policyPrivileges) { return policyPrivileges.contains(requestPrivilege); } @@ -189,8 +166,7 @@ private boolean isPrivilegeMatch( private boolean isResourceMatch( final String policyType, final @Nullable DataHubResourceFilter policyResourceFilter, - final Optional requestResource, - final PolicyEvaluationContext context) { + final Optional requestResource) { if (PoliciesConfig.PLATFORM_POLICY_TYPE.equals(policyType)) { // Currently, platform policies have no associated resource. return true; @@ -199,7 +175,7 @@ private boolean isResourceMatch( // No resource defined on the policy. return true; } - if (!requestResource.isPresent()) { + if (requestResource.isEmpty()) { // Resource filter present in policy, but no resource spec provided. log.debug("Resource filter present in policy, but no resource spec provided."); return false; @@ -218,31 +194,31 @@ private PolicyMatchFilter getFilter(DataHubResourceFilter policyResourceFilter) } PolicyMatchCriterionArray criteria = new PolicyMatchCriterionArray(); if (policyResourceFilter.hasType()) { - criteria.add(new PolicyMatchCriterion().setField(ResourceFieldType.RESOURCE_TYPE.name()) + criteria.add(new PolicyMatchCriterion().setField(EntityFieldType.TYPE.name()) .setValues(new StringArray(Collections.singletonList(policyResourceFilter.getType())))); } if (policyResourceFilter.hasType() && policyResourceFilter.hasResources() && !policyResourceFilter.isAllResources()) { criteria.add( - new PolicyMatchCriterion().setField(ResourceFieldType.RESOURCE_URN.name()).setValues(policyResourceFilter.getResources())); + new PolicyMatchCriterion().setField(EntityFieldType.URN.name()).setValues(policyResourceFilter.getResources())); } return new PolicyMatchFilter().setCriteria(criteria); } - private boolean checkFilter(final PolicyMatchFilter filter, final ResolvedResourceSpec resource) { + private boolean checkFilter(final PolicyMatchFilter filter, final ResolvedEntitySpec resource) { return filter.getCriteria().stream().allMatch(criterion -> checkCriterion(criterion, resource)); } - private boolean checkCriterion(final PolicyMatchCriterion criterion, final ResolvedResourceSpec resource) { - ResourceFieldType resourceFieldType; + private boolean checkCriterion(final PolicyMatchCriterion criterion, final ResolvedEntitySpec resource) { + EntityFieldType entityFieldType; try { - resourceFieldType = ResourceFieldType.valueOf(criterion.getField().toUpperCase()); + entityFieldType = EntityFieldType.valueOf(criterion.getField().toUpperCase()); } catch (IllegalArgumentException e) { log.error("Unsupported field type {}", criterion.getField()); return false; } - Set fieldValues = resource.getFieldValues(resourceFieldType); + Set fieldValues = resource.getFieldValues(entityFieldType); return criterion.getValues() .stream() .anyMatch(filterValue -> checkCondition(fieldValues, filterValue, criterion.getCondition())); @@ 
-257,46 +233,51 @@ private boolean checkCondition(Set fieldValues, String filterValue, Poli } /** + * Returns true if the actor portion of a DataHub policy matches a the actor being evaluated, false otherwise. * Returns true if the actor portion of a DataHub policy matches a the actor being evaluated, false otherwise. */ private boolean isActorMatch( - final Urn actor, + final ResolvedEntitySpec resolvedActorSpec, final DataHubActorFilter actorFilter, - final Optional resourceSpec, + final Optional resourceSpec, final PolicyEvaluationContext context) { // 1. If the actor is a matching "User" in the actor filter, return true immediately. - if (isUserMatch(actor, actorFilter)) { + if (isUserMatch(resolvedActorSpec, actorFilter)) { return true; } // 2. If the actor is in a matching "Group" in the actor filter, return true immediately. - if (isGroupMatch(actor, actorFilter, context)) { + if (isGroupMatch(resolvedActorSpec, actorFilter, context)) { return true; } // 3. If the actor is the owner, either directly or indirectly via a group, return true immediately. - if (isOwnerMatch(actor, actorFilter, resourceSpec, context)) { + if (isOwnerMatch(resolvedActorSpec, actorFilter, resourceSpec, context)) { return true; } // 4. If the actor is in a matching "Role" in the actor filter, return true immediately. - return isRoleMatch(actor, actorFilter, context); + return isRoleMatch(resolvedActorSpec, actorFilter, context); } - private boolean isUserMatch(final Urn actor, final DataHubActorFilter actorFilter) { + private boolean isUserMatch(final ResolvedEntitySpec resolvedActorSpec, final DataHubActorFilter actorFilter) { // If the actor is a matching "User" in the actor filter, return true immediately. return actorFilter.isAllUsers() || (actorFilter.hasUsers() && Objects.requireNonNull(actorFilter.getUsers()) - .stream() - .anyMatch(user -> user.equals(actor))); + .stream().map(Urn::toString) + .anyMatch(user -> user.equals(resolvedActorSpec.getSpec().getEntity()))); } - private boolean isGroupMatch(final Urn actor, final DataHubActorFilter actorFilter, final PolicyEvaluationContext context) { + private boolean isGroupMatch( + final ResolvedEntitySpec resolvedActorSpec, + final DataHubActorFilter actorFilter, + final PolicyEvaluationContext context) { // If the actor is in a matching "Group" in the actor filter, return true immediately. if (actorFilter.isAllGroups() || actorFilter.hasGroups()) { - final Set groups = resolveGroups(actor, context); - return actorFilter.isAllGroups() || (actorFilter.hasGroups() && Objects.requireNonNull(actorFilter.getGroups()) - .stream() + final Set groups = resolveGroups(resolvedActorSpec, context); + return (actorFilter.isAllGroups() && !groups.isEmpty()) + || (actorFilter.hasGroups() && Objects.requireNonNull(actorFilter.getGroups()) + .stream().map(Urn::toString) .anyMatch(groups::contains)); } // If there are no groups on the policy, return false for the group match. @@ -304,24 +285,24 @@ private boolean isGroupMatch(final Urn actor, final DataHubActorFilter actorFilt } private boolean isOwnerMatch( - final Urn actor, + final ResolvedEntitySpec resolvedActorSpec, final DataHubActorFilter actorFilter, - final Optional requestResource, + final Optional requestResource, final PolicyEvaluationContext context) { // If the policy does not apply to owners, or there is no resource to own, return false immediately. 
- if (!actorFilter.isResourceOwners() || !requestResource.isPresent()) { + if (!actorFilter.isResourceOwners() || requestResource.isEmpty()) { return false; } List ownershipTypes = actorFilter.getResourceOwnersTypes(); - return isActorOwner(actor, requestResource.get(), ownershipTypes, context); + return isActorOwner(resolvedActorSpec, requestResource.get(), ownershipTypes, context); } - private Set getOwnersForType(ResourceSpec resourceSpec, List ownershipTypes) { - Urn entityUrn = UrnUtils.getUrn(resourceSpec.getResource()); + private Set getOwnersForType(EntitySpec resourceSpec, List ownershipTypes) { + Urn entityUrn = UrnUtils.getUrn(resourceSpec.getEntity()); EnvelopedAspect ownershipAspect; try { EntityResponse response = _entityClient.getV2(entityUrn.getEntityType(), entityUrn, - Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME), _systemAuthentication); + Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME), _systemAuthentication); if (response == null || !response.getAspects().containsKey(Constants.OWNERSHIP_ASPECT_NAME)) { return Collections.emptySet(); } @@ -338,50 +319,56 @@ private Set getOwnersForType(ResourceSpec resourceSpec, List owners return ownersStream.map(owner -> owner.getOwner().toString()).collect(Collectors.toSet()); } - private boolean isActorOwner(Urn actor, ResolvedResourceSpec resourceSpec, List ownershipTypes, PolicyEvaluationContext context) { + private boolean isActorOwner( + final ResolvedEntitySpec resolvedActorSpec, + ResolvedEntitySpec resourceSpec, List ownershipTypes, + PolicyEvaluationContext context) { Set owners = this.getOwnersForType(resourceSpec.getSpec(), ownershipTypes); - if (isUserOwner(actor, owners)) { - return true; - } - final Set groups = resolveGroups(actor, context); - if (isGroupOwner(groups, owners)) { + if (isUserOwner(resolvedActorSpec, owners)) { return true; } - return false; + final Set groups = resolveGroups(resolvedActorSpec, context); + + return isGroupOwner(groups, owners); } - private boolean isUserOwner(Urn actor, Set owners) { - return owners.contains(actor.toString()); + private boolean isUserOwner(final ResolvedEntitySpec resolvedActorSpec, Set owners) { + return owners.contains(resolvedActorSpec.getSpec().getEntity()); } - private boolean isGroupOwner(Set groups, Set owners) { - return groups.stream().anyMatch(group -> owners.contains(group.toString())); + private boolean isGroupOwner(Set groups, Set owners) { + return groups.stream().anyMatch(owners::contains); } - private boolean isRoleMatch(final Urn actor, final DataHubActorFilter actorFilter, + private boolean isRoleMatch( + final ResolvedEntitySpec resolvedActorSpec, + final DataHubActorFilter actorFilter, final PolicyEvaluationContext context) { // Can immediately return false if the actor filter does not have any roles if (!actorFilter.hasRoles()) { return false; } // If the actor has a matching "Role" in the actor filter, return true immediately. 
- Set actorRoles = resolveRoles(actor, context); + Set actorRoles = resolveRoles(resolvedActorSpec, context); return Objects.requireNonNull(actorFilter.getRoles()) .stream() .anyMatch(actorRoles::contains); } - private Set resolveRoles(Urn actor, PolicyEvaluationContext context) { + private Set resolveRoles(final ResolvedEntitySpec resolvedActorSpec, PolicyEvaluationContext context) { if (context.roles != null) { return context.roles; } + String actor = resolvedActorSpec.getSpec().getEntity(); + Set roles = new HashSet<>(); final EnvelopedAspectMap aspectMap; try { - final EntityResponse corpUser = _entityClient.batchGetV2(CORP_USER_ENTITY_NAME, Collections.singleton(actor), - Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME), _systemAuthentication).get(actor); + Urn actorUrn = Urn.createFromString(actor); + final EntityResponse corpUser = _entityClient.batchGetV2(CORP_USER_ENTITY_NAME, Collections.singleton(actorUrn), + Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME), _systemAuthentication).get(actorUrn); if (corpUser == null || !corpUser.hasAspects()) { return roles; } @@ -403,62 +390,25 @@ private Set resolveRoles(Urn actor, PolicyEvaluationContext context) { return roles; } - private Set resolveGroups(Urn actor, PolicyEvaluationContext context) { + private Set resolveGroups(ResolvedEntitySpec resolvedActorSpec, PolicyEvaluationContext context) { if (context.groups != null) { return context.groups; } - Set groups = new HashSet<>(); - final EnvelopedAspectMap aspectMap; - - try { - final EntityResponse corpUser = _entityClient.batchGetV2(CORP_USER_ENTITY_NAME, Collections.singleton(actor), - ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME), _systemAuthentication) - .get(actor); - if (corpUser == null || !corpUser.hasAspects()) { - return groups; - } - aspectMap = corpUser.getAspects(); - } catch (Exception e) { - throw new RuntimeException(String.format("Failed to fetch %s and %s for urn %s", GROUP_MEMBERSHIP_ASPECT_NAME, - NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME, actor), e); - } - - Optional maybeGroupMembership = resolveGroupMembership(aspectMap); - maybeGroupMembership.ifPresent(groupMembership -> groups.addAll(groupMembership.getGroups())); - - Optional maybeNativeGroupMembership = resolveNativeGroupMembership(aspectMap); - maybeNativeGroupMembership.ifPresent( - nativeGroupMembership -> groups.addAll(nativeGroupMembership.getNativeGroups())); + Set groups = resolvedActorSpec.getGroupMembership(); context.setGroups(groups); // Cache the groups. return groups; } - // TODO: Optimization - Cache the group membership. Refresh periodically. - private Optional resolveGroupMembership(final EnvelopedAspectMap aspectMap) { - if (aspectMap.containsKey(GROUP_MEMBERSHIP_ASPECT_NAME)) { - return Optional.of(new GroupMembership(aspectMap.get(GROUP_MEMBERSHIP_ASPECT_NAME).getValue().data())); - } - return Optional.empty(); - } - - private Optional resolveNativeGroupMembership(final EnvelopedAspectMap aspectMap) { - if (aspectMap.containsKey(NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)) { - return Optional.of( - new NativeGroupMembership(aspectMap.get(NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME).getValue().data())); - } - return Optional.empty(); - } - /** * Class used to store state across a single Policy evaluation. 
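The shape of the refactor is easiest to see at the call site: the actor is resolved to a ResolvedEntitySpec once, and group membership is read from that spec rather than fetched from the entity client on every policy evaluation. A rough sketch, where entitySpecResolver stands in for the DefaultEntitySpecResolver wired up elsewhere in this patch and the actor URN is hypothetical:

// Sketch only: resolve the actor once, then evaluate any number of policies
// against it without further group-membership lookups.
EntitySpec actorSpec = new EntitySpec(CORP_USER_ENTITY_NAME, "urn:li:corpuser:jdoe");
ResolvedEntitySpec resolvedActorSpec = entitySpecResolver.resolve(actorSpec); // assumed resolver API
PolicyEngine.PolicyEvaluationResult result =
    policyEngine.evaluatePolicy(policy, resolvedActorSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec));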
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java
new file mode 100644
index 00000000000000..cbb237654e9693
--- /dev/null
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProvider.java
@@ -0,0 +1,70 @@
+package com.datahub.authorization.fieldresolverprovider;
+
+import com.datahub.authentication.Authentication;
+import com.datahub.authorization.EntityFieldType;
+import com.datahub.authorization.EntitySpec;
+import com.datahub.authorization.FieldResolver;
+import com.linkedin.common.DataPlatformInstance;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.entity.EnvelopedAspect;
+import com.linkedin.entity.client.EntityClient;
+import java.util.Collections;
+import java.util.List;
+import java.util.Objects;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+import static com.linkedin.metadata.Constants.*;
+
+/**
+ * Provides field resolver for data platform instance given entitySpec
+ */
+@Slf4j
+@RequiredArgsConstructor
+public class DataPlatformInstanceFieldResolverProvider implements EntityFieldResolverProvider {
+
+  private final EntityClient _entityClient;
+  private final Authentication _systemAuthentication;
+
+  @Override
+  public List<EntityFieldType> getFieldTypes() {
+    return Collections.singletonList(EntityFieldType.DATA_PLATFORM_INSTANCE);
+  }
+
+  @Override
+  public FieldResolver getFieldResolver(EntitySpec entitySpec) {
+    return FieldResolver.getResolverFromFunction(entitySpec, this::getDataPlatformInstance);
+  }
+
+  private FieldResolver.FieldValue getDataPlatformInstance(EntitySpec entitySpec) {
+    Urn entityUrn = UrnUtils.getUrn(entitySpec.getEntity());
+    // In the case that the entity is a platform instance, the associated platform instance entity is the instance itself
+    if (entityUrn.getEntityType().equals(DATA_PLATFORM_INSTANCE_ENTITY_NAME)) {
+      return FieldResolver.FieldValue.builder()
+          .values(Collections.singleton(entityUrn.toString()))
+          .build();
+    }
+
+    EnvelopedAspect dataPlatformInstanceAspect;
+    try {
+      EntityResponse response = _entityClient.getV2(entityUrn.getEntityType(), entityUrn,
+          Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME), _systemAuthentication);
+      if (response == null || !response.getAspects().containsKey(DATA_PLATFORM_INSTANCE_ASPECT_NAME)) {
+        return FieldResolver.emptyFieldValue();
+      }
+      dataPlatformInstanceAspect = response.getAspects().get(DATA_PLATFORM_INSTANCE_ASPECT_NAME);
+    } catch (Exception e) {
+      log.error("Error while retrieving platform instance aspect for urn {}", entityUrn, e);
+      return FieldResolver.emptyFieldValue();
+    }
+    DataPlatformInstance dataPlatformInstance = new DataPlatformInstance(dataPlatformInstanceAspect.getValue().data());
+    if (dataPlatformInstance.getInstance() == null) {
+      return FieldResolver.emptyFieldValue();
+    }
+    return FieldResolver.FieldValue.builder()
+        .values(Collections.singleton(Objects.requireNonNull(dataPlatformInstance.getInstance()).toString()))
+        .build();
+  }
+}
\ No newline at end of file
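A minimal sketch of how the new provider is exercised, assuming a mocked EntityClient in the style of the tests later in this patch:

// Illustrative: for a non-instance entity the provider reads the
// dataPlatformInstance aspect through the entity client; for a
// dataPlatformInstance entity it returns the entity's own URN.
DataPlatformInstanceFieldResolverProvider provider =
    new DataPlatformInstanceFieldResolverProvider(mockEntityClient, systemAuthentication);
FieldResolver resolver =
    provider.getFieldResolver(new EntitySpec("dataset", "urn:li:dataset:test"));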
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java
index 68c1dd4f644e59..15d821b75c0bdd 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/DomainFieldResolverProvider.java
@@ -2,8 +2,8 @@

 import com.datahub.authentication.Authentication;
 import com.datahub.authorization.FieldResolver;
-import com.datahub.authorization.ResourceFieldType;
-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntityFieldType;
+import com.datahub.authorization.EntitySpec;
 import com.linkedin.common.urn.Urn;
 import com.linkedin.common.urn.UrnUtils;
 import com.linkedin.domain.DomainProperties;
@@ -14,6 +14,7 @@

 import java.util.Collections;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
@@ -27,23 +28,23 @@

 /**
- * Provides field resolver for domain given resourceSpec
+ * Provides field resolver for domain given entitySpec
 */
 @Slf4j
 @RequiredArgsConstructor
-public class DomainFieldResolverProvider implements ResourceFieldResolverProvider {
+public class DomainFieldResolverProvider implements EntityFieldResolverProvider {

  private final EntityClient _entityClient;
  private final Authentication _systemAuthentication;

  @Override
-  public ResourceFieldType getFieldType() {
-    return ResourceFieldType.DOMAIN;
+  public List<EntityFieldType> getFieldTypes() {
+    return Collections.singletonList(EntityFieldType.DOMAIN);
  }

  @Override
-  public FieldResolver getFieldResolver(ResourceSpec resourceSpec) {
-    return FieldResolver.getResolverFromFunction(resourceSpec, this::getDomains);
+  public FieldResolver getFieldResolver(EntitySpec entitySpec) {
+    return FieldResolver.getResolverFromFunction(entitySpec, this::getDomains);
  }

  private Set<Urn> getBatchedParentDomains(@Nonnull final Set<Urn> urns) {
@@ -78,8 +79,8 @@ private Set<Urn> getBatchedParentDomains(@Nonnull final Set<Urn> urns) {
    return parentUrns;
  }

-  private FieldResolver.FieldValue getDomains(ResourceSpec resourceSpec) {
-    final Urn entityUrn = UrnUtils.getUrn(resourceSpec.getResource());
+  private FieldResolver.FieldValue getDomains(EntitySpec entitySpec) {
+    final Urn entityUrn = UrnUtils.getUrn(entitySpec.getEntity());
    // In the case that the entity is a domain, the associated domain is the domain itself
    if (entityUrn.getEntityType().equals(DOMAIN_ENTITY_NAME)) {
      return FieldResolver.FieldValue.builder()
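Worth noting: getBatchedParentDomains walks up the domain hierarchy, so the resolved DOMAIN values for an entity include its ancestor domains. That is what allows a policy scoped to a parent domain to cover assets in child domains, roughly:

// Sketch (URN invented): the resolved values contain the asset's domain
// plus its ancestors, so a filter on the parent domain matches.
Set<String> domains = resolvedResourceSpec.getFieldValues(EntityFieldType.DOMAIN);
boolean matches = domains.contains("urn:li:domain:engineering");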
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java
new file mode 100644
index 00000000000000..227d403a9cd1d1
--- /dev/null
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityFieldResolverProvider.java
@@ -0,0 +1,24 @@
+package com.datahub.authorization.fieldresolverprovider;
+
+import com.datahub.authorization.FieldResolver;
+import com.datahub.authorization.EntityFieldType;
+import com.datahub.authorization.EntitySpec;
+import java.util.List;
+
+
+/**
+ * Base interface for defining a class that provides the field resolver for the given field types
+ */
+public interface EntityFieldResolverProvider {
+
+  /**
+   * List of fields that this hydrator is hydrating.
+   * @return the field types this provider hydrates
+   */
+  List<EntityFieldType> getFieldTypes();
+
+  /**
+   * Return resolver for fetching the field values given the entity
+   */
+  FieldResolver getFieldResolver(EntitySpec entitySpec);
+}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java
index 58e3d78ce8c3b5..addac84c68b185 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityTypeFieldResolverProvider.java
@@ -1,22 +1,25 @@
 package com.datahub.authorization.fieldresolverprovider;

 import com.datahub.authorization.FieldResolver;
-import com.datahub.authorization.ResourceFieldType;
-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntityFieldType;
+import com.datahub.authorization.EntitySpec;
+import com.datastax.oss.driver.shaded.guava.common.collect.ImmutableList;
 import java.util.Collections;
+import java.util.List;

 /**
- * Provides field resolver for entity type given resourceSpec
+ * Provides field resolver for entity type given entitySpec
 */
-public class EntityTypeFieldResolverProvider implements ResourceFieldResolverProvider {
+public class EntityTypeFieldResolverProvider implements EntityFieldResolverProvider {
+
  @Override
-  public ResourceFieldType getFieldType() {
-    return ResourceFieldType.RESOURCE_TYPE;
+  public List<EntityFieldType> getFieldTypes() {
+    return ImmutableList.of(EntityFieldType.TYPE, EntityFieldType.RESOURCE_TYPE);
  }

  @Override
-  public FieldResolver getFieldResolver(ResourceSpec resourceSpec) {
-    return FieldResolver.getResolverFromValues(Collections.singleton(resourceSpec.getType()));
+  public FieldResolver getFieldResolver(EntitySpec entitySpec) {
+    return FieldResolver.getResolverFromValues(Collections.singleton(entitySpec.getType()));
  }
}
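Returning a list of field types is what preserves backward compatibility: one provider can answer for both the new field name and its legacy alias, so policies persisted with the old names still resolve. For example:

// Both the new TYPE field and the legacy RESOURCE_TYPE alias are served.
EntityTypeFieldResolverProvider provider = new EntityTypeFieldResolverProvider();
assert provider.getFieldTypes().contains(EntityFieldType.TYPE);
assert provider.getFieldTypes().contains(EntityFieldType.RESOURCE_TYPE); // legacy alias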
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java
index b9d98f1dcbac09..32960de687839a 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/EntityUrnFieldResolverProvider.java
@@ -1,22 +1,25 @@
 package com.datahub.authorization.fieldresolverprovider;

 import com.datahub.authorization.FieldResolver;
-import com.datahub.authorization.ResourceFieldType;
-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntityFieldType;
+import com.datahub.authorization.EntitySpec;
+import com.datastax.oss.driver.shaded.guava.common.collect.ImmutableList;
 import java.util.Collections;
+import java.util.List;

 /**
- * Provides field resolver for entity urn given resourceSpec
+ * Provides field resolver for entity urn given entitySpec
 */
-public class EntityUrnFieldResolverProvider implements ResourceFieldResolverProvider {
+public class EntityUrnFieldResolverProvider implements EntityFieldResolverProvider {
+
  @Override
-  public ResourceFieldType getFieldType() {
-    return ResourceFieldType.RESOURCE_URN;
+  public List<EntityFieldType> getFieldTypes() {
+    return ImmutableList.of(EntityFieldType.URN, EntityFieldType.RESOURCE_URN);
  }

  @Override
-  public FieldResolver getFieldResolver(ResourceSpec resourceSpec) {
-    return FieldResolver.getResolverFromValues(Collections.singleton(resourceSpec.getResource()));
+  public FieldResolver getFieldResolver(EntitySpec entitySpec) {
+    return FieldResolver.getResolverFromValues(Collections.singleton(entitySpec.getEntity()));
  }
}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java
new file mode 100644
index 00000000000000..b1202d9f4bbd34
--- /dev/null
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProvider.java
@@ -0,0 +1,79 @@
+package com.datahub.authorization.fieldresolverprovider;
+
+import com.datahub.authentication.Authentication;
+import com.datahub.authorization.FieldResolver;
+import com.datahub.authorization.EntityFieldType;
+import com.datahub.authorization.EntitySpec;
+import com.google.common.collect.ImmutableSet;
+import com.linkedin.common.urn.Urn;
+import com.linkedin.common.urn.UrnUtils;
+import com.linkedin.entity.EntityResponse;
+import com.linkedin.entity.EnvelopedAspect;
+import com.linkedin.entity.client.EntityClient;
+import com.linkedin.identity.NativeGroupMembership;
+import com.linkedin.metadata.Constants;
+import com.linkedin.identity.GroupMembership;
+import java.util.Collections;
+import lombok.RequiredArgsConstructor;
+import lombok.extern.slf4j.Slf4j;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Collectors;
+
+import static com.linkedin.metadata.Constants.GROUP_MEMBERSHIP_ASPECT_NAME;
+import static com.linkedin.metadata.Constants.NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME;
+
+
+/**
+ * Provides field resolver for group membership given entitySpec
+ */
+@Slf4j
+@RequiredArgsConstructor
+public class GroupMembershipFieldResolverProvider implements EntityFieldResolverProvider {
+
+  private final EntityClient _entityClient;
+  private final Authentication _systemAuthentication;
+
+  @Override
+  public List<EntityFieldType> getFieldTypes() {
+    return Collections.singletonList(EntityFieldType.GROUP_MEMBERSHIP);
+  }
+
+  @Override
+  public FieldResolver getFieldResolver(EntitySpec entitySpec) {
+    return FieldResolver.getResolverFromFunction(entitySpec, this::getGroupMembership);
+  }
+
+  private FieldResolver.FieldValue getGroupMembership(EntitySpec entitySpec) {
+    Urn entityUrn = UrnUtils.getUrn(entitySpec.getEntity());
+    EnvelopedAspect groupMembershipAspect;
+    EnvelopedAspect nativeGroupMembershipAspect;
+    List<Urn> groups = new ArrayList<>();
+    try {
+      EntityResponse response = _entityClient.getV2(entityUrn.getEntityType(), entityUrn,
+          ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME), _systemAuthentication);
+      if (response == null
+          || !(response.getAspects().containsKey(Constants.GROUP_MEMBERSHIP_ASPECT_NAME)
+          || response.getAspects().containsKey(Constants.NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME))) {
+        return FieldResolver.emptyFieldValue();
+      }
+      if (response.getAspects().containsKey(Constants.GROUP_MEMBERSHIP_ASPECT_NAME)) {
+        groupMembershipAspect = response.getAspects().get(Constants.GROUP_MEMBERSHIP_ASPECT_NAME);
+        GroupMembership groupMembership = new GroupMembership(groupMembershipAspect.getValue().data());
+        groups.addAll(groupMembership.getGroups());
+      }
+      if (response.getAspects().containsKey(Constants.NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)) {
+        nativeGroupMembershipAspect = response.getAspects().get(Constants.NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME);
+        NativeGroupMembership nativeGroupMembership = new NativeGroupMembership(nativeGroupMembershipAspect.getValue().data());
+        groups.addAll(nativeGroupMembership.getNativeGroups());
+      }
+    } catch (Exception e) {
+      log.error("Error while retrieving group membership aspect for urn {}", entityUrn, e);
+      return FieldResolver.emptyFieldValue();
+    }
+    return FieldResolver.FieldValue.builder()
+        .values(groups.stream().map(Urn::toString).collect(Collectors.toSet()))
+        .build();
+  }
+}
diff --git a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java
index 20ec6a09377c81..3c27f9e6ce8d79 100644
--- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java
+++ b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/OwnerFieldResolverProvider.java
@@ -2,8 +2,8 @@

 import com.datahub.authentication.Authentication;
 import com.datahub.authorization.FieldResolver;
-import com.datahub.authorization.ResourceFieldType;
-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntityFieldType;
+import com.datahub.authorization.EntitySpec;
 import com.linkedin.common.Ownership;
 import com.linkedin.common.urn.Urn;
 import com.linkedin.common.urn.UrnUtils;
@@ -12,33 +12,34 @@

 import com.linkedin.entity.client.EntityClient;
 import com.linkedin.metadata.Constants;
 import java.util.Collections;
+import java.util.List;
 import java.util.stream.Collectors;
 import lombok.RequiredArgsConstructor;
 import lombok.extern.slf4j.Slf4j;

 /**
- * Provides field resolver for owners given resourceSpec
+ * Provides field resolver for owners given entitySpec
 */
 @Slf4j
 @RequiredArgsConstructor
-public class OwnerFieldResolverProvider implements ResourceFieldResolverProvider {
+public class OwnerFieldResolverProvider implements EntityFieldResolverProvider {

  private final EntityClient _entityClient;
  private final Authentication _systemAuthentication;

  @Override
-  public ResourceFieldType getFieldType() {
-    return ResourceFieldType.OWNER;
+  public List<EntityFieldType> getFieldTypes() {
+    return Collections.singletonList(EntityFieldType.OWNER);
  }

  @Override
-  public FieldResolver getFieldResolver(ResourceSpec resourceSpec) {
-    return FieldResolver.getResolverFromFunction(resourceSpec, this::getOwners);
+  public FieldResolver getFieldResolver(EntitySpec entitySpec) {
+    return FieldResolver.getResolverFromFunction(entitySpec, this::getOwners);
  }

-  private FieldResolver.FieldValue getOwners(ResourceSpec resourceSpec) {
-    Urn entityUrn = UrnUtils.getUrn(resourceSpec.getResource());
+  private FieldResolver.FieldValue getOwners(EntitySpec entitySpec) {
+    Urn entityUrn = UrnUtils.getUrn(entitySpec.getEntity());
    EnvelopedAspect ownershipAspect;
    try {
      EntityResponse response = _entityClient.getV2(entityUrn.getEntityType(), entityUrn,
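With every provider now implementing EntityFieldResolverProvider, a spec resolver can assemble them uniformly. A rough sketch of that wiring (the actual list lives in DefaultEntitySpecResolver, outside this excerpt; constructor shapes follow the @RequiredArgsConstructor definitions above):

List<EntityFieldResolverProvider> providers = ImmutableList.of(
    new EntityTypeFieldResolverProvider(),
    new EntityUrnFieldResolverProvider(),
    new DomainFieldResolverProvider(entityClient, systemAuthentication),
    new OwnerFieldResolverProvider(entityClient, systemAuthentication),
    new DataPlatformInstanceFieldResolverProvider(entityClient, systemAuthentication),
    new GroupMembershipFieldResolverProvider(entityClient, systemAuthentication));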
b/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/ResourceFieldResolverProvider.java deleted file mode 100644 index 4ba4200f8035ed..00000000000000 --- a/metadata-service/auth-impl/src/main/java/com/datahub/authorization/fieldresolverprovider/ResourceFieldResolverProvider.java +++ /dev/null @@ -1,22 +0,0 @@ -package com.datahub.authorization.fieldresolverprovider; - -import com.datahub.authorization.FieldResolver; -import com.datahub.authorization.ResourceFieldType; -import com.datahub.authorization.ResourceSpec; - - -/** - * Base class for defining a class that provides the field resolver for the given field type - */ -public interface ResourceFieldResolverProvider { - - /** - * Field that this hydrator is hydrating - */ - ResourceFieldType getFieldType(); - - /** - * Return resolver for fetching the field values given the resource - */ - FieldResolver getFieldResolver(ResourceSpec resourceSpec); -} diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java index 2e48123fb1813f..24ecfa6fefc856 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/DataHubAuthorizerTest.java @@ -158,7 +158,7 @@ public void testSystemAuthentication() throws Exception { // Validate that the System Actor is authorized, even if there is no policy. - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( new Actor(ActorType.USER, DATAHUB_SYSTEM_CLIENT_ID).toUrnStr(), @@ -172,7 +172,7 @@ public void testSystemAuthentication() throws Exception { @Test public void testAuthorizeGranted() throws Exception { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -186,7 +186,7 @@ public void testAuthorizeGranted() throws Exception { @Test public void testAuthorizeNotGranted() throws Exception { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); // Policy for this privilege is inactive. AuthorizationRequest request = new AuthorizationRequest( @@ -203,7 +203,7 @@ public void testAllowAllMode() throws Exception { _dataHubAuthorizer.setMode(DataHubAuthorizer.AuthorizationMode.ALLOW_ALL); - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); // Policy for this privilege is inactive. AuthorizationRequest request = new AuthorizationRequest( @@ -219,7 +219,7 @@ public void testAllowAllMode() throws Exception { public void testInvalidateCache() throws Exception { // First make sure that the default policies are as expected. 
- ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -250,7 +250,7 @@ public void testInvalidateCache() throws Exception { public void testAuthorizedActorsActivePolicy() throws Exception { final AuthorizedActors actors = _dataHubAuthorizer.authorizedActors("EDIT_ENTITY_TAGS", // Should be inside the active policy. - Optional.of(new ResourceSpec("dataset", "urn:li:dataset:1"))); + Optional.of(new EntitySpec("dataset", "urn:li:dataset:1"))); assertTrue(actors.isAllUsers()); assertTrue(actors.isAllGroups()); @@ -272,7 +272,7 @@ public void testAuthorizedActorsActivePolicy() throws Exception { @Test public void testAuthorizationOnDomainWithPrivilegeIsAllowed() { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -285,7 +285,7 @@ public void testAuthorizationOnDomainWithPrivilegeIsAllowed() { @Test public void testAuthorizationOnDomainWithParentPrivilegeIsAllowed() { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -298,7 +298,7 @@ public void testAuthorizationOnDomainWithParentPrivilegeIsAllowed() { @Test public void testAuthorizationOnDomainWithoutPrivilegeIsDenied() { - ResourceSpec resourceSpec = new ResourceSpec("dataset", "urn:li:dataset:test"); + EntitySpec resourceSpec = new EntitySpec("dataset", "urn:li:dataset:test"); AuthorizationRequest request = new AuthorizationRequest( "urn:li:corpuser:test", @@ -334,7 +334,7 @@ private DataHubPolicyInfo createDataHubPolicyInfo(boolean active, List p resourceFilter.setType("dataset"); if (domain != null) { - resourceFilter.setFilter(FilterUtils.newFilter(ImmutableMap.of(ResourceFieldType.DOMAIN, Collections.singletonList(domain.toString())))); + resourceFilter.setFilter(FilterUtils.newFilter(ImmutableMap.of(EntityFieldType.DOMAIN, Collections.singletonList(domain.toString())))); } dataHubPolicyInfo.setResources(resourceFilter); @@ -398,6 +398,6 @@ private Map createDomainPropertiesBatchResponse(@Nullable f } private AuthorizerContext createAuthorizerContext(final Authentication systemAuthentication, final EntityClient entityClient) { - return new AuthorizerContext(Collections.emptyMap(), new DefaultResourceSpecResolver(systemAuthentication, entityClient)); + return new AuthorizerContext(Collections.emptyMap(), new DefaultEntitySpecResolver(systemAuthentication, entityClient)); } } diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java index 99d8fee309d917..be8c948f8ef897 100644 --- a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/PolicyEngineTest.java @@ -11,15 +11,12 @@ import com.linkedin.common.OwnershipType; import com.linkedin.common.UrnArray; import com.linkedin.common.urn.Urn; -import com.linkedin.common.urn.UrnUtils; import com.linkedin.data.template.StringArray; import com.linkedin.entity.Aspect; import 
com.linkedin.entity.EntityResponse; import com.linkedin.entity.EnvelopedAspect; import com.linkedin.entity.EnvelopedAspectMap; import com.linkedin.entity.client.EntityClient; -import com.linkedin.identity.CorpUserInfo; -import com.linkedin.identity.GroupMembership; import com.linkedin.identity.RoleMembership; import com.linkedin.metadata.Constants; import com.linkedin.policy.DataHubActorFilter; @@ -45,22 +42,19 @@ public class PolicyEngineTest { private static final String AUTHORIZED_PRINCIPAL = "urn:li:corpuser:datahub"; private static final String UNAUTHORIZED_PRINCIPAL = "urn:li:corpuser:unauthorized"; - private static final String AUTHORIZED_GROUP = "urn:li:corpGroup:authorizedGroup"; - private static final String RESOURCE_URN = "urn:li:dataset:test"; - private static final String DOMAIN_URN = "urn:li:domain:domain1"; - private static final String OWNERSHIP_TYPE_URN = "urn:li:ownershipType:__system__technical_owner"; - private static final String OTHER_OWNERSHIP_TYPE_URN = "urn:li:ownershipType:__system__data_steward"; private EntityClient _entityClient; private PolicyEngine _policyEngine; private Urn authorizedUserUrn; + private ResolvedEntitySpec resolvedAuthorizedUserSpec; private Urn unauthorizedUserUrn; + private ResolvedEntitySpec resolvedUnauthorizedUserSpec; private Urn resourceUrn; @BeforeMethod @@ -68,29 +62,34 @@ public void setupTest() throws Exception { _entityClient = Mockito.mock(EntityClient.class); _policyEngine = new PolicyEngine(Mockito.mock(Authentication.class), _entityClient); - // Init mocks. - EntityResponse authorizedEntityResponse = createAuthorizedEntityResponse(); authorizedUserUrn = Urn.createFromString(AUTHORIZED_PRINCIPAL); + resolvedAuthorizedUserSpec = buildEntityResolvers(CORP_USER_ENTITY_NAME, AUTHORIZED_PRINCIPAL, + Collections.emptySet(), Collections.emptySet(), Collections.singleton(AUTHORIZED_GROUP)); + unauthorizedUserUrn = Urn.createFromString(UNAUTHORIZED_PRINCIPAL); + resolvedUnauthorizedUserSpec = buildEntityResolvers(CORP_USER_ENTITY_NAME, UNAUTHORIZED_PRINCIPAL); + resourceUrn = Urn.createFromString(RESOURCE_URN); + + // Init role membership mocks. + EntityResponse authorizedEntityResponse = createAuthorizedEntityResponse(); authorizedEntityResponse.setUrn(authorizedUserUrn); Map authorizedEntityResponseMap = Collections.singletonMap(authorizedUserUrn, authorizedEntityResponse); - when(_entityClient.batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), any(), - any())).thenReturn(authorizedEntityResponseMap); + when(_entityClient.batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), + eq(Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME)), any())).thenReturn(authorizedEntityResponseMap); EntityResponse unauthorizedEntityResponse = createUnauthorizedEntityResponse(); - unauthorizedUserUrn = Urn.createFromString(UNAUTHORIZED_PRINCIPAL); unauthorizedEntityResponse.setUrn(unauthorizedUserUrn); Map unauthorizedEntityResponseMap = Collections.singletonMap(unauthorizedUserUrn, unauthorizedEntityResponse); - when(_entityClient.batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(unauthorizedUserUrn)), any(), - any())).thenReturn(unauthorizedEntityResponseMap); + when(_entityClient.batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(unauthorizedUserUrn)), + eq(Collections.singleton(ROLE_MEMBERSHIP_ASPECT_NAME)), any())).thenReturn(unauthorizedEntityResponseMap); + // Init ownership type mocks. 
EntityResponse entityResponse = new EntityResponse(); EnvelopedAspectMap envelopedAspectMap = new EnvelopedAspectMap(); envelopedAspectMap.put(OWNERSHIP_ASPECT_NAME, new EnvelopedAspect().setValue(new com.linkedin.entity.Aspect(createOwnershipAspect(true, true).data()))); entityResponse.setAspects(envelopedAspectMap); - resourceUrn = Urn.createFromString(RESOURCE_URN); Map mockMap = mock(Map.class); when(_entityClient.batchGetV2(any(), eq(Collections.singleton(resourceUrn)), eq(Collections.singleton(OWNERSHIP_ASPECT_NAME)), any())).thenReturn(mockMap); @@ -120,9 +119,9 @@ public void testEvaluatePolicyInactivePolicyState() { resourceFilter.setAllResources(true); resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -149,9 +148,9 @@ public void testEvaluatePolicyPrivilegeFilterNoMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_OWNERS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_OWNERS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -176,7 +175,8 @@ public void testEvaluatePlatformPolicyPrivilegeFilterMatch() throws Exception { dataHubPolicyInfo.setActors(actorFilter); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "MANAGE_POLICIES", Optional.empty()); + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "MANAGE_POLICIES", + Optional.empty()); assertTrue(result.isGranted()); // Verify no network calls @@ -208,10 +208,10 @@ public void testEvaluatePolicyActorFilterUserMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert Authorized user can edit entity tags. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); @@ -245,10 +245,10 @@ public void testEvaluatePolicyActorFilterUserNoMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert unauthorized user cannot edit entity tags. 
PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, "urn:li:corpuser:test", "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, buildEntityResolvers(CORP_USER_ENTITY_NAME, "urn:li:corpuser:test"), "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result2.isGranted()); @@ -270,7 +270,7 @@ public void testEvaluatePolicyActorFilterGroupMatch() throws Exception { final DataHubActorFilter actorFilter = new DataHubActorFilter(); final UrnArray groupsUrnArray = new UrnArray(); - groupsUrnArray.add(Urn.createFromString("urn:li:corpGroup:authorizedGroup")); + groupsUrnArray.add(Urn.createFromString(AUTHORIZED_GROUP)); actorFilter.setGroups(groupsUrnArray); actorFilter.setResourceOwners(false); actorFilter.setAllUsers(false); @@ -282,16 +282,15 @@ public void testEvaluatePolicyActorFilterGroupMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags, because of group membership. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); - // Verify we are only calling for group during these requests. - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -307,7 +306,7 @@ public void testEvaluatePolicyActorFilterGroupNoMatch() throws Exception { final DataHubActorFilter actorFilter = new DataHubActorFilter(); final UrnArray groupsUrnArray = new UrnArray(); - groupsUrnArray.add(Urn.createFromString("urn:li:corpGroup:authorizedGroup")); + groupsUrnArray.add(Urn.createFromString(AUTHORIZED_GROUP)); actorFilter.setGroups(groupsUrnArray); actorFilter.setResourceOwners(false); actorFilter.setAllUsers(false); @@ -319,16 +318,15 @@ public void testEvaluatePolicyActorFilterGroupNoMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert unauthorized user cannot edit entity tags. PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result2.isGranted()); - // Verify we are only calling for group during these requests. 
- verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), - eq(Collections.singleton(unauthorizedUserUrn)), any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -357,17 +355,17 @@ public void testEvaluatePolicyActorFilterRoleMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags. PolicyEngine.PolicyEvaluationResult authorizedResult = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(authorizedResult.isGranted()); // Verify we are only calling for roles during these requests. - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - any(), any()); + verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), + eq(Collections.singleton(authorizedUserUrn)), any(), any()); } @Test @@ -396,10 +394,10 @@ public void testEvaluatePolicyActorFilterNoRoleMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags. PolicyEngine.PolicyEvaluationResult unauthorizedResult = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(unauthorizedResult.isGranted()); @@ -431,16 +429,16 @@ public void testEvaluatePolicyActorFilterAllUsersMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags, because of group membership. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); // Assert unauthorized user cannot edit entity tags. PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result2.isGranted()); @@ -470,24 +468,21 @@ public void testEvaluatePolicyActorFilterAllGroupsMatch() throws Exception { resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert authorized user can edit entity tags, because of group membership. 
PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); // Assert unauthorized user cannot edit entity tags. PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); - assertTrue(result2.isGranted()); + assertFalse(result2.isGranted()); - // Verify we are only calling for group during these requests. - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - any(), any()); - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), - eq(Collections.singleton(unauthorizedUserUrn)), any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -519,17 +514,17 @@ public void testEvaluatePolicyActorFilterUserResourceOwnersMatch() throws Except when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet(), + Collections.emptySet()); // Assert authorized user can edit entity tags, because he is a user owner. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); - // Ensure no calls for group membership. 
- verify(_entityClient, times(0)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - eq(null), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -562,13 +557,17 @@ public void testEvaluatePolicyActorFilterUserResourceOwnersTypeMatch() throws Ex when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet(), + Collections.emptySet()); PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); + + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -601,13 +600,16 @@ public void testEvaluatePolicyActorFilterUserResourceOwnersTypeNoMatch() throws when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL), Collections.emptySet(), Collections.emptySet()); PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result1.isGranted()); + + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -639,17 +641,17 @@ public void testEvaluatePolicyActorFilterGroupResourceOwnersMatch() throws Excep when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_GROUP), Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_GROUP), Collections.emptySet(), + Collections.emptySet()); // Assert authorized user can edit entity tags, because he is a user owner. PolicyEngine.PolicyEvaluationResult result1 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result1.isGranted()); - // Ensure that caching of groups is working with 1 call to entity client for each principal. 
- verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), eq(Collections.singleton(authorizedUserUrn)), - any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -673,16 +675,15 @@ public void testEvaluatePolicyActorFilterGroupResourceOwnersNoMatch() throws Exc resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); // Assert unauthorized user cannot edit entity tags. PolicyEngine.PolicyEvaluationResult result2 = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, UNAUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedUnauthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result2.isGranted()); - // Ensure that caching of groups is working with 1 call to entity client for each principal. - verify(_entityClient, times(1)).batchGetV2(eq(CORP_USER_ENTITY_NAME), - eq(Collections.singleton(unauthorizedUserUrn)), any(), any()); + // Verify no network calls + verify(_entityClient, times(0)).batchGetV2(any(), any(), any(), any()); } @Test @@ -706,10 +707,10 @@ public void testEvaluatePolicyResourceFilterAllResourcesMatch() throws Exception resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", "urn:li:dataset:random"); // A dataset Authorized principal _does not own_. + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", "urn:li:dataset:random"); // A dataset Authorized principal _does not own_. PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result.isGranted()); @@ -738,9 +739,9 @@ public void testEvaluatePolicyResourceFilterAllResourcesNoMatch() throws Excepti resourceFilter.setType("dataset"); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("chart", RESOURCE_URN); // Notice: Not a dataset. + ResolvedEntitySpec resourceSpec = buildEntityResolvers("chart", RESOURCE_URN); // Notice: Not a dataset. 
PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -773,9 +774,9 @@ public void testEvaluatePolicyResourceFilterSpecificResourceMatchLegacy() throws resourceFilter.setResources(resourceUrns); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result.isGranted()); @@ -801,13 +802,13 @@ public void testEvaluatePolicyResourceFilterSpecificResourceMatch() throws Excep final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); resourceFilter.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), - ResourceFieldType.RESOURCE_URN, Collections.singletonList(RESOURCE_URN)))); + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), + EntityFieldType.URN, Collections.singletonList(RESOURCE_URN)))); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN); + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result.isGranted()); @@ -833,14 +834,14 @@ public void testEvaluatePolicyResourceFilterSpecificResourceNoMatch() throws Exc final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); resourceFilter.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), - ResourceFieldType.RESOURCE_URN, Collections.singletonList(RESOURCE_URN)))); + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), + EntityFieldType.URN, Collections.singletonList(RESOURCE_URN)))); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy. + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy. 
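The two hunks just above also rename the filter keys: `ResourceFieldType.RESOURCE_TYPE`/`RESOURCE_URN` become `EntityFieldType.TYPE`/`URN`. Consolidated into one sketch, a policy targeting a single dataset now reads as follows (names taken verbatim from the surrounding hunks):

```java
// A resource filter that matches exactly one dataset by type + urn.
final DataHubResourceFilter resourceFilter = new DataHubResourceFilter();
resourceFilter.setFilter(FilterUtils.newFilter(ImmutableMap.of(
    EntityFieldType.TYPE, Collections.singletonList("dataset"),
    EntityFieldType.URN, Collections.singletonList(RESOURCE_URN))));
dataHubPolicyInfo.setResources(resourceFilter);
```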
PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -866,14 +867,14 @@ public void testEvaluatePolicyResourceFilterSpecificResourceMatchDomain() throws final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); resourceFilter.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), ResourceFieldType.DOMAIN, + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), EntityFieldType.DOMAIN, Collections.singletonList(DOMAIN_URN)))); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, Collections.emptySet(), Collections.singleton(DOMAIN_URN)); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, Collections.emptySet(), Collections.singleton(DOMAIN_URN), Collections.emptySet()); PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertTrue(result.isGranted()); @@ -899,14 +900,14 @@ public void testEvaluatePolicyResourceFilterSpecificResourceNoMatchDomain() thro final DataHubResourceFilter resourceFilter = new DataHubResourceFilter(); resourceFilter.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), ResourceFieldType.DOMAIN, + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), EntityFieldType.DOMAIN, Collections.singletonList(DOMAIN_URN)))); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN, Collections.emptySet(), - Collections.singleton("urn:li:domain:domain2")); // Domain doesn't match + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, Collections.emptySet(), + Collections.singleton("urn:li:domain:domain2"), Collections.emptySet()); // Domain doesn't match PolicyEngine.PolicyEvaluationResult result = - _policyEngine.evaluatePolicy(dataHubPolicyInfo, AUTHORIZED_PRINCIPAL, "EDIT_ENTITY_TAGS", + _policyEngine.evaluatePolicy(dataHubPolicyInfo, resolvedAuthorizedUserSpec, "EDIT_ENTITY_TAGS", Optional.of(resourceSpec)); assertFalse(result.isGranted()); @@ -933,7 +934,7 @@ public void testGetGrantedPrivileges() throws Exception { final DataHubResourceFilter resourceFilter1 = new DataHubResourceFilter(); resourceFilter1.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), ResourceFieldType.DOMAIN, + ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), EntityFieldType.DOMAIN, Collections.singletonList(DOMAIN_URN)))); dataHubPolicyInfo1.setResources(resourceFilter1); @@ -954,8 +955,8 @@ public void testGetGrantedPrivileges() throws Exception { final DataHubResourceFilter resourceFilter2 = new DataHubResourceFilter(); resourceFilter2.setFilter(FilterUtils.newFilter( - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, Collections.singletonList("dataset"), - ResourceFieldType.RESOURCE_URN, Collections.singletonList(RESOURCE_URN)))); + 
ImmutableMap.of(EntityFieldType.TYPE, Collections.singletonList("dataset"), + EntityFieldType.URN, Collections.singletonList(RESOURCE_URN)))); dataHubPolicyInfo2.setResources(resourceFilter2); // Policy 3, match dataset type and owner (legacy resource filter) @@ -981,25 +982,25 @@ public void testGetGrantedPrivileges() throws Exception { final List policies = ImmutableList.of(dataHubPolicyInfo1, dataHubPolicyInfo2, dataHubPolicyInfo3); - assertEquals(_policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.empty()), + assertEquals(_policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.empty()), Collections.emptyList()); - ResolvedResourceSpec resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN, Collections.emptySet(), - Collections.singleton(DOMAIN_URN)); // Everything matches + ResolvedEntitySpec resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, Collections.emptySet(), + Collections.singleton(DOMAIN_URN), Collections.emptySet()); // Everything matches assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), ImmutableList.of("PRIVILEGE_1", "PRIVILEGE_2_1", "PRIVILEGE_2_2")); - resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN, Collections.emptySet(), - Collections.singleton("urn:li:domain:domain2")); // Domain doesn't match + resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, Collections.emptySet(), + Collections.singleton("urn:li:domain:domain2"), Collections.emptySet()); // Domain doesn't match assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), ImmutableList.of("PRIVILEGE_2_1", "PRIVILEGE_2_2")); - resourceSpec = buildResourceResolvers("dataset", "urn:li:dataset:random", Collections.emptySet(), - Collections.singleton(DOMAIN_URN)); // Resource doesn't match + resourceSpec = buildEntityResolvers("dataset", "urn:li:dataset:random", Collections.emptySet(), + Collections.singleton(DOMAIN_URN), Collections.emptySet()); // Resource doesn't match assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), ImmutableList.of("PRIVILEGE_1")); final EntityResponse entityResponse = new EntityResponse(); @@ -1008,16 +1009,16 @@ public void testGetGrantedPrivileges() throws Exception { entityResponse.setAspects(aspectMap); when(_entityClient.getV2(eq(resourceUrn.getEntityType()), eq(resourceUrn), eq(Collections.singleton(Constants.OWNERSHIP_ASPECT_NAME)), any())).thenReturn(entityResponse); - resourceSpec = buildResourceResolvers("dataset", RESOURCE_URN, Collections.singleton(AUTHORIZED_PRINCIPAL), - Collections.singleton(DOMAIN_URN)); // Is owner + resourceSpec = buildEntityResolvers("dataset", RESOURCE_URN, Collections.singleton(AUTHORIZED_PRINCIPAL), + Collections.singleton(DOMAIN_URN), Collections.emptySet()); // Is owner assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), ImmutableList.of("PRIVILEGE_1", 
"PRIVILEGE_2_1", "PRIVILEGE_2_2", "PRIVILEGE_3")); - resourceSpec = buildResourceResolvers("chart", RESOURCE_URN, Collections.singleton(AUTHORIZED_PRINCIPAL), - Collections.singleton(DOMAIN_URN)); // Resource type doesn't match + resourceSpec = buildEntityResolvers("chart", RESOURCE_URN, Collections.singleton(AUTHORIZED_PRINCIPAL), + Collections.singleton(DOMAIN_URN), Collections.emptySet()); // Resource type doesn't match assertEquals( - _policyEngine.getGrantedPrivileges(policies, UrnUtils.getUrn(AUTHORIZED_PRINCIPAL), Optional.of(resourceSpec)), + _policyEngine.getGrantedPrivileges(policies, resolvedAuthorizedUserSpec, Optional.of(resourceSpec)), Collections.emptyList()); } @@ -1050,9 +1051,9 @@ public void testGetMatchingActorsResourceMatch() throws Exception { resourceFilter.setResources(resourceUrns); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL, AUTHORIZED_GROUP), - Collections.emptySet()); + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", RESOURCE_URN, ImmutableSet.of(AUTHORIZED_PRINCIPAL, AUTHORIZED_GROUP), + Collections.emptySet(), Collections.emptySet()); PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); assertTrue(actors.allUsers()); @@ -1101,8 +1102,8 @@ public void testGetMatchingActorsNoResourceMatch() throws Exception { resourceFilter.setResources(resourceUrns); dataHubPolicyInfo.setResources(resourceFilter); - ResolvedResourceSpec resourceSpec = - buildResourceResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy. + ResolvedEntitySpec resourceSpec = + buildEntityResolvers("dataset", "urn:li:dataset:random"); // A resource not covered by the policy. 
PolicyEngine.PolicyActors actors = _policyEngine.getMatchingActors(dataHubPolicyInfo, Optional.of(resourceSpec)); assertFalse(actors.allUsers()); @@ -1155,21 +1156,6 @@ private EntityResponse createAuthorizedEntityResponse() throws URISyntaxExceptio final EntityResponse entityResponse = new EntityResponse(); final EnvelopedAspectMap aspectMap = new EnvelopedAspectMap(); - final CorpUserInfo userInfo = new CorpUserInfo(); - userInfo.setActive(true); - userInfo.setFullName("Data Hub"); - userInfo.setFirstName("Data"); - userInfo.setLastName("Hub"); - userInfo.setEmail("datahub@gmail.com"); - userInfo.setTitle("Admin"); - aspectMap.put(CORP_USER_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(userInfo.data()))); - - final GroupMembership groupsAspect = new GroupMembership(); - final UrnArray groups = new UrnArray(); - groups.add(Urn.createFromString("urn:li:corpGroup:authorizedGroup")); - groupsAspect.setGroups(groups); - aspectMap.put(GROUP_MEMBERSHIP_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(groupsAspect.data()))); - final RoleMembership rolesAspect = new RoleMembership(); final UrnArray roles = new UrnArray(); roles.add(Urn.createFromString("urn:li:dataHubRole:admin")); @@ -1184,21 +1170,6 @@ private EntityResponse createUnauthorizedEntityResponse() throws URISyntaxExcept final EntityResponse entityResponse = new EntityResponse(); final EnvelopedAspectMap aspectMap = new EnvelopedAspectMap(); - final CorpUserInfo userInfo = new CorpUserInfo(); - userInfo.setActive(true); - userInfo.setFullName("Unauthorized User"); - userInfo.setFirstName("Unauthorized"); - userInfo.setLastName("User"); - userInfo.setEmail("Unauth"); - userInfo.setTitle("Engineer"); - aspectMap.put(CORP_USER_INFO_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(userInfo.data()))); - - final GroupMembership groupsAspect = new GroupMembership(); - final UrnArray groups = new UrnArray(); - groups.add(Urn.createFromString("urn:li:corpGroup:unauthorizedGroup")); - groupsAspect.setGroups(groups); - aspectMap.put(GROUP_MEMBERSHIP_ASPECT_NAME, new EnvelopedAspect().setValue(new Aspect(groupsAspect.data()))); - final RoleMembership rolesAspect = new RoleMembership(); final UrnArray roles = new UrnArray(); roles.add(Urn.createFromString("urn:li:dataHubRole:reader")); @@ -1209,17 +1180,18 @@ private EntityResponse createUnauthorizedEntityResponse() throws URISyntaxExcept return entityResponse; } - public static ResolvedResourceSpec buildResourceResolvers(String entityType, String entityUrn) { - return buildResourceResolvers(entityType, entityUrn, Collections.emptySet(), Collections.emptySet()); + public static ResolvedEntitySpec buildEntityResolvers(String entityType, String entityUrn) { + return buildEntityResolvers(entityType, entityUrn, Collections.emptySet(), Collections.emptySet(), Collections.emptySet()); } - public static ResolvedResourceSpec buildResourceResolvers(String entityType, String entityUrn, Set owners, - Set domains) { - return new ResolvedResourceSpec(new ResourceSpec(entityType, entityUrn), - ImmutableMap.of(ResourceFieldType.RESOURCE_TYPE, - FieldResolver.getResolverFromValues(Collections.singleton(entityType)), ResourceFieldType.RESOURCE_URN, - FieldResolver.getResolverFromValues(Collections.singleton(entityUrn)), ResourceFieldType.OWNER, - FieldResolver.getResolverFromValues(owners), ResourceFieldType.DOMAIN, - FieldResolver.getResolverFromValues(domains))); + public static ResolvedEntitySpec buildEntityResolvers(String entityType, String entityUrn, Set owners, + Set domains, Set 
groups) { + return new ResolvedEntitySpec(new EntitySpec(entityType, entityUrn), + ImmutableMap.of(EntityFieldType.TYPE, + FieldResolver.getResolverFromValues(Collections.singleton(entityType)), EntityFieldType.URN, + FieldResolver.getResolverFromValues(Collections.singleton(entityUrn)), EntityFieldType.OWNER, + FieldResolver.getResolverFromValues(owners), EntityFieldType.DOMAIN, + FieldResolver.getResolverFromValues(domains), EntityFieldType.GROUP_MEMBERSHIP, + FieldResolver.getResolverFromValues(groups))); } } diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java new file mode 100644 index 00000000000000..5c7d87f1c05a96 --- /dev/null +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/DataPlatformInstanceFieldResolverProviderTest.java @@ -0,0 +1,193 @@ +package com.datahub.authorization.fieldresolverprovider; + +import static com.linkedin.metadata.Constants.DATASET_ENTITY_NAME; +import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ASPECT_NAME; +import static com.linkedin.metadata.Constants.DATA_PLATFORM_INSTANCE_ENTITY_NAME; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.verifyZeroInteractions; +import static org.mockito.Mockito.when; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +import com.datahub.authentication.Authentication; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; +import com.linkedin.common.DataPlatformInstance; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.r2.RemoteInvocationException; +import java.net.URISyntaxException; +import java.util.Collections; +import java.util.Set; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +public class DataPlatformInstanceFieldResolverProviderTest { + + private static final String DATA_PLATFORM_INSTANCE_URN = + "urn:li:dataPlatformInstance:(urn:li:dataPlatform:s3,test-platform-instance)"; + private static final String RESOURCE_URN = + "urn:li:dataset:(urn:li:dataPlatform:s3,test-platform-instance.testDataset,PROD)"; + private static final EntitySpec RESOURCE_SPEC = new EntitySpec(DATASET_ENTITY_NAME, RESOURCE_URN); + + @Mock + private EntityClient entityClientMock; + @Mock + private Authentication systemAuthenticationMock; + + private DataPlatformInstanceFieldResolverProvider dataPlatformInstanceFieldResolverProvider; + + @BeforeMethod + public void setup() { + MockitoAnnotations.initMocks(this); + dataPlatformInstanceFieldResolverProvider = + new DataPlatformInstanceFieldResolverProvider(entityClientMock, systemAuthenticationMock); + } + + @Test + public void shouldReturnDataPlatformInstanceType() { + assertEquals(EntityFieldType.DATA_PLATFORM_INSTANCE, 
dataPlatformInstanceFieldResolverProvider.getFieldTypes().get(0)); + } + + @Test + public void shouldReturnFieldValueWithResourceSpecIfTypeIsDataPlatformInstance() { + var resourceSpec = new EntitySpec(DATA_PLATFORM_INSTANCE_ENTITY_NAME, DATA_PLATFORM_INSTANCE_URN); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(resourceSpec); + + assertEquals(Set.of(DATA_PLATFORM_INSTANCE_URN), result.getFieldValuesFuture().join().getValues()); + verifyZeroInteractions(entityClientMock); + } + + @Test + public void shouldReturnEmptyFieldValueWhenResponseIsNull() throws RemoteInvocationException, URISyntaxException { + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(null); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenResourceHasNoDataPlatformInstance() + throws RemoteInvocationException, URISyntaxException { + var entityResponseMock = mock(EntityResponse.class); + when(entityResponseMock.getAspects()).thenReturn(new EnvelopedAspectMap()); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenThereIsAnException() throws RemoteInvocationException, URISyntaxException { + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenThrow(new RemoteInvocationException()); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenDataPlatformInstanceHasNoInstance() + throws RemoteInvocationException, URISyntaxException { + + var dataPlatform = new DataPlatformInstance() + .setPlatform(Urn.createFromString("urn:li:dataPlatform:s3")); + var entityResponseMock = mock(EntityResponse.class); + var envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(DATA_PLATFORM_INSTANCE_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(dataPlatform.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result 
= dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnFieldValueWithDataPlatformInstanceOfTheResource() + throws RemoteInvocationException, URISyntaxException { + + var dataPlatformInstance = new DataPlatformInstance() + .setPlatform(Urn.createFromString("urn:li:dataPlatform:s3")) + .setInstance(Urn.createFromString(DATA_PLATFORM_INSTANCE_URN)); + var entityResponseMock = mock(EntityResponse.class); + var envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(DATA_PLATFORM_INSTANCE_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(dataPlatformInstance.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = dataPlatformInstanceFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertEquals(Set.of(DATA_PLATFORM_INSTANCE_URN), result.getFieldValuesFuture().join().getValues()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(Collections.singleton(DATA_PLATFORM_INSTANCE_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } +} diff --git a/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java new file mode 100644 index 00000000000000..af547f14cd3fcd --- /dev/null +++ b/metadata-service/auth-impl/src/test/java/com/datahub/authorization/fieldresolverprovider/GroupMembershipFieldResolverProviderTest.java @@ -0,0 +1,212 @@ +package com.datahub.authorization.fieldresolverprovider; + +import com.datahub.authentication.Authentication; +import com.datahub.authorization.EntityFieldType; +import com.datahub.authorization.EntitySpec; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableSet; +import com.linkedin.common.UrnArray; +import com.linkedin.common.urn.Urn; +import com.linkedin.entity.Aspect; +import com.linkedin.entity.EntityResponse; +import com.linkedin.entity.EnvelopedAspect; +import com.linkedin.entity.EnvelopedAspectMap; +import com.linkedin.entity.client.EntityClient; +import com.linkedin.identity.GroupMembership; +import com.linkedin.identity.NativeGroupMembership; +import com.linkedin.r2.RemoteInvocationException; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; +import org.testng.annotations.BeforeMethod; +import org.testng.annotations.Test; + +import java.net.URISyntaxException; +import java.util.Set; + +import static com.linkedin.metadata.Constants.*; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.*; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; + +public class GroupMembershipFieldResolverProviderTest { + + private static final String CORPGROUP_URN = "urn:li:corpGroup:groupname"; + private static final String NATIVE_CORPGROUP_URN = 
"urn:li:corpGroup:nativegroupname"; + private static final String RESOURCE_URN = "urn:li:dataset:(urn:li:dataPlatform:testPlatform,testDataset,PROD)"; + private static final EntitySpec RESOURCE_SPEC = new EntitySpec(DATASET_ENTITY_NAME, RESOURCE_URN); + + @Mock + private EntityClient entityClientMock; + @Mock + private Authentication systemAuthenticationMock; + + private GroupMembershipFieldResolverProvider groupMembershipFieldResolverProvider; + + @BeforeMethod + public void setup() { + MockitoAnnotations.initMocks(this); + groupMembershipFieldResolverProvider = + new GroupMembershipFieldResolverProvider(entityClientMock, systemAuthenticationMock); + } + + @Test + public void shouldReturnGroupsMembershipType() { + assertEquals(EntityFieldType.GROUP_MEMBERSHIP, groupMembershipFieldResolverProvider.getFieldTypes().get(0)); + } + + @Test + public void shouldReturnEmptyFieldValueWhenResponseIsNull() throws RemoteInvocationException, URISyntaxException { + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(null); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenResourceDoesNotBelongToAnyGroup() + throws RemoteInvocationException, URISyntaxException { + var entityResponseMock = mock(EntityResponse.class); + when(entityResponseMock.getAspects()).thenReturn(new EnvelopedAspectMap()); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnEmptyFieldValueWhenThereIsAnException() throws RemoteInvocationException, URISyntaxException { + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenThrow(new RemoteInvocationException()); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertTrue(result.getFieldValuesFuture().join().getValues().isEmpty()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnFieldValueWithOnlyGroupsOfTheResource() + throws RemoteInvocationException, URISyntaxException { + + var groupMembership = new GroupMembership().setGroups( + new UrnArray(ImmutableList.of(Urn.createFromString(CORPGROUP_URN)))); + var entityResponseMock = mock(EntityResponse.class); + var 
envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(GROUP_MEMBERSHIP_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(groupMembership.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertEquals(Set.of(CORPGROUP_URN), result.getFieldValuesFuture().join().getValues()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnFieldValueWithOnlyNativeGroupsOfTheResource() + throws RemoteInvocationException, URISyntaxException { + + var nativeGroupMembership = new NativeGroupMembership().setNativeGroups( + new UrnArray(ImmutableList.of(Urn.createFromString(NATIVE_CORPGROUP_URN)))); + var entityResponseMock = mock(EntityResponse.class); + var envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(nativeGroupMembership.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertEquals(Set.of(NATIVE_CORPGROUP_URN), result.getFieldValuesFuture().join().getValues()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } + + @Test + public void shouldReturnFieldValueWithGroupsAndNativeGroupsOfTheResource() + throws RemoteInvocationException, URISyntaxException { + + var groupMembership = new GroupMembership().setGroups( + new UrnArray(ImmutableList.of(Urn.createFromString(CORPGROUP_URN)))); + var nativeGroupMembership = new NativeGroupMembership().setNativeGroups( + new UrnArray(ImmutableList.of(Urn.createFromString(NATIVE_CORPGROUP_URN)))); + var entityResponseMock = mock(EntityResponse.class); + var envelopedAspectMap = new EnvelopedAspectMap(); + envelopedAspectMap.put(GROUP_MEMBERSHIP_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(groupMembership.data()))); + envelopedAspectMap.put(NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME, + new EnvelopedAspect().setValue(new Aspect(nativeGroupMembership.data()))); + when(entityResponseMock.getAspects()).thenReturn(envelopedAspectMap); + when(entityClientMock.getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + )).thenReturn(entityResponseMock); + + var result = groupMembershipFieldResolverProvider.getFieldResolver(RESOURCE_SPEC); + + assertEquals(Set.of(CORPGROUP_URN, NATIVE_CORPGROUP_URN), result.getFieldValuesFuture().join().getValues()); + verify(entityClientMock, times(1)).getV2( + eq(DATASET_ENTITY_NAME), + any(Urn.class), + 
eq(ImmutableSet.of(GROUP_MEMBERSHIP_ASPECT_NAME, NATIVE_GROUP_MEMBERSHIP_ASPECT_NAME)), + eq(systemAuthenticationMock) + ); + } +} \ No newline at end of file diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java new file mode 100644 index 00000000000000..7a93119226a2d2 --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ConsumerConfiguration.java @@ -0,0 +1,10 @@ +package com.linkedin.metadata.config.kafka; + +import lombok.Data; + + +@Data +public class ConsumerConfiguration { + + private int maxPartitionFetchBytes; +} diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/KafkaConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/KafkaConfiguration.java index 2966abfc63396f..2345f88352c170 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/KafkaConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/KafkaConfiguration.java @@ -12,4 +12,6 @@ public class KafkaConfiguration { private SchemaRegistryConfiguration schemaRegistry; private ProducerConfiguration producer; + + private ConsumerConfiguration consumer; } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ProducerConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ProducerConfiguration.java index 2bf4cea3f0c188..26a8c6b6491332 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ProducerConfiguration.java +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/kafka/ProducerConfiguration.java @@ -13,4 +13,8 @@ public class ProducerConfiguration { private int requestTimeout; private int backoffTimeout; + + private String compressionType; + + private int maxRequestSize; } diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 4be31b2b6bb151..b817208672e08b 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -25,6 +25,8 @@ authentication: # Key used to sign new tokens. signingKey: ${DATAHUB_TOKEN_SERVICE_SIGNING_KEY:WnEdIeTG/VVCLQqGwC/BAkqyY0k+H8NEAtWGejrBI94=} salt: ${DATAHUB_TOKEN_SERVICE_SALT:ohDVbJBvHHVJh9S/UA4BYF9COuNnqqVhr9MLKEGXk1O=} + issuer: ${DATAHUB_TOKEN_SERVICE_ISSUER:datahub-metadata-service} + signingAlgorithm: ${DATAHUB_TOKEN_SERVICE_SIGNING_ALGORITHM:HS256} # The max duration of a UI session in milliseconds. Defaults to 1 day. 
sessionTokenDurationMs: ${SESSION_TOKEN_DURATION_MS:86400000} @@ -226,6 +228,10 @@ kafka: deliveryTimeout: ${KAFKA_PRODUCER_DELIVERY_TIMEOUT:30000} requestTimeout: ${KAFKA_PRODUCER_REQUEST_TIMEOUT:3000} backoffTimeout: ${KAFKA_PRODUCER_BACKOFF_TIMEOUT:500} + compressionType: ${KAFKA_PRODUCER_COMPRESSION_TYPE:snappy} # producer's compression algorithm + maxRequestSize: ${KAFKA_PRODUCER_MAX_REQUEST_SIZE:5242880} # the max bytes sent by the producer, also see kafka-setup MAX_MESSAGE_BYTES for matching value + consumer: + maxPartitionFetchBytes: ${KAFKA_CONSUMER_MAX_PARTITION_FETCH_BYTES:5242880} # the max bytes consumed per partition schemaRegistry: type: ${SCHEMA_REGISTRY_TYPE:KAFKA} # INTERNAL or KAFKA or AWS_GLUE url: ${KAFKA_SCHEMAREGISTRY_URL:http://localhost:8081} @@ -276,6 +282,13 @@ bootstrap: enabled: ${UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED:false} # enable to run the upgrade to migrate legacy default browse paths to new ones backfillBrowsePathsV2: enabled: ${BACKFILL_BROWSE_PATHS_V2:false} # Enables running the backfill of browsePathsV2 upgrade step. There are concerns about the load of this step so hiding it behind a flag. Deprecating in favor of running through SystemUpdate + policies: + file: ${BOOTSTRAP_POLICIES_FILE:classpath:boot/policies.json} + # eg for local file + # file: "file:///datahub/datahub-gms/resources/custom-policies.json" + servlets: + waitTimeout: ${BOOTSTRAP_SERVLETS_WAITTIMEOUT:60} # Total waiting time in seconds for servlets to initialize + systemUpdate: initialBackOffMs: ${BOOTSTRAP_SYSTEM_UPDATE_INITIAL_BACK_OFF_MILLIS:5000} diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java index bf50a0c7b64734..b90257870a8b2c 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/AuthorizerChainFactory.java @@ -2,12 +2,12 @@ import com.datahub.authorization.AuthorizerChain; import com.datahub.authorization.DataHubAuthorizer; -import com.datahub.authorization.DefaultResourceSpecResolver; +import com.datahub.authorization.DefaultEntitySpecResolver; import com.datahub.plugins.PluginConstant; import com.datahub.authentication.Authentication; import com.datahub.plugins.auth.authorization.Authorizer; import com.datahub.authorization.AuthorizerContext; -import com.datahub.authorization.ResourceSpecResolver; +import com.datahub.authorization.EntitySpecResolver; import com.datahub.plugins.common.PluginConfig; import com.datahub.plugins.common.PluginPermissionManager; import com.datahub.plugins.common.PluginType; @@ -64,7 +64,7 @@ public class AuthorizerChainFactory { @Scope("singleton") @Nonnull protected AuthorizerChain getInstance() { - final ResourceSpecResolver resolver = initResolver(); + final EntitySpecResolver resolver = initResolver(); // Extract + initialize customer authorizers from application configs. 
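The three Kafka sizing keys added above are meant to move together: the producer's `maxRequestSize`, the broker's max message bytes (the YAML comment points at the matching `kafka-setup` value), and the consumer's `maxPartitionFetchBytes` should agree, otherwise a message can be produced but never consumed. A self-contained sketch of the client properties these settings ultimately feed, mirroring `DataHubKafkaProducerFactory` and `KafkaEventConsumerFactory` later in this diff (the literal values are the YAML defaults):

```java
import java.util.HashMap;
import java.util.Map;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.producer.ProducerConfig;

public class KafkaSizingSketch {
  public static void main(String[] args) {
    // Producer: compress payloads and cap a single request at 5 MiB.
    Map<String, Object> producerProps = new HashMap<>();
    producerProps.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, "snappy");
    producerProps.put(ProducerConfig.MAX_REQUEST_SIZE_CONFIG, 5242880);

    // Consumer: allow fetching at least what the producer may send per partition.
    Map<String, Object> consumerProps = new HashMap<>();
    consumerProps.put(ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG, 5242880);
  }
}
```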
final List authorizers = new ArrayList<>(initCustomAuthorizers(resolver)); @@ -79,11 +79,11 @@ protected AuthorizerChain getInstance() { return new AuthorizerChain(authorizers, dataHubAuthorizer); } - private ResourceSpecResolver initResolver() { - return new DefaultResourceSpecResolver(systemAuthentication, entityClient); + private EntitySpecResolver initResolver() { + return new DefaultEntitySpecResolver(systemAuthentication, entityClient); } - private List initCustomAuthorizers(ResourceSpecResolver resolver) { + private List initCustomAuthorizers(EntitySpecResolver resolver) { final List customAuthorizers = new ArrayList<>(); Path pluginBaseDirectory = Paths.get(configurationProvider.getDatahub().getPlugin().getAuth().getPath()); @@ -99,7 +99,7 @@ private List initCustomAuthorizers(ResourceSpecResolver resolver) { return customAuthorizers; } - private void registerAuthorizer(List customAuthorizers, ResourceSpecResolver resolver, Config config) { + private void registerAuthorizer(List customAuthorizers, EntitySpecResolver resolver, Config config) { PluginConfigFactory authorizerPluginPluginConfigFactory = new PluginConfigFactory(config); // Load only Authorizer configuration from plugin config factory List authorizers = diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java index 6b2a61882be90b..d47e1a0a734010 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/auth/DataHubTokenServiceFactory.java @@ -23,10 +23,10 @@ public class DataHubTokenServiceFactory { @Value("${authentication.tokenService.salt:}") private String saltingKey; - @Value("${elasticsearch.tokenService.signingAlgorithm:HS256}") + @Value("${authentication.tokenService.signingAlgorithm:HS256}") private String signingAlgorithm; - @Value("${elasticsearch.tokenService.issuer:datahub-metadata-service}") + @Value("${authentication.tokenService.issuer:datahub-metadata-service}") private String issuer; /** diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java index c67a2e704681fc..78b3de501e0e5a 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/DataHubKafkaProducerFactory.java @@ -59,6 +59,8 @@ public static Map buildProducerProperties(SchemaRegistryConfig s props.put(ProducerConfig.DELIVERY_TIMEOUT_MS_CONFIG, kafkaConfiguration.getProducer().getDeliveryTimeout()); props.put(ProducerConfig.REQUEST_TIMEOUT_MS_CONFIG, kafkaConfiguration.getProducer().getRequestTimeout()); props.put(ProducerConfig.RETRY_BACKOFF_MS_CONFIG, kafkaConfiguration.getProducer().getBackoffTimeout()); + props.put(ProducerConfig.COMPRESSION_TYPE_CONFIG, kafkaConfiguration.getProducer().getCompressionType()); + props.put(ProducerConfig.MAX_REQUEST_SIZE_CONFIG, kafkaConfiguration.getProducer().getMaxRequestSize()); // Override KafkaProperties with SchemaRegistryConfig only for non-empty values schemaRegistryConfig.getProperties().entrySet() diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java 
b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java index ba18be6834d14d..7a9e80781d639a 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/KafkaEventConsumerFactory.java @@ -70,6 +70,7 @@ private static Map buildCustomizedProperties(KafkaProperties bas consumerProps.setEnableAutoCommit(true); consumerProps.setAutoCommitInterval(Duration.ofSeconds(10)); + // KAFKA_BOOTSTRAP_SERVER has precedence over SPRING_KAFKA_BOOTSTRAP_SERVERS if (kafkaConfiguration.getBootstrapServers() != null && kafkaConfiguration.getBootstrapServers().length() > 0) { consumerProps.setBootstrapServers(Arrays.asList(kafkaConfiguration.getBootstrapServers().split(","))); @@ -84,6 +85,9 @@ private static Map buildCustomizedProperties(KafkaProperties bas .filter(entry -> entry.getValue() != null && !entry.getValue().toString().isEmpty()) .forEach(entry -> customizedProperties.put(entry.getKey(), entry.getValue())); + customizedProperties.put(ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG, + kafkaConfiguration.getConsumer().getMaxPartitionFetchBytes()); + return customizedProperties; } diff --git a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/SimpleKafkaConsumerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/SimpleKafkaConsumerFactory.java index 05ebfdddf8b805..e12cbec87fe451 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/SimpleKafkaConsumerFactory.java +++ b/metadata-service/factories/src/main/java/com/linkedin/gms/factory/kafka/SimpleKafkaConsumerFactory.java @@ -4,8 +4,11 @@ import com.linkedin.gms.factory.config.ConfigurationProvider; import java.time.Duration; import java.util.Arrays; +import java.util.Map; + import lombok.extern.slf4j.Slf4j; import org.apache.avro.generic.GenericRecord; +import org.apache.kafka.clients.consumer.ConsumerConfig; import org.apache.kafka.common.serialization.StringDeserializer; import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.boot.autoconfigure.kafka.KafkaProperties; @@ -40,10 +43,14 @@ protected KafkaListenerContainerFactory createInstance(@Qualifier("configurat consumerProps.setBootstrapServers(Arrays.asList(kafkaConfiguration.getBootstrapServers().split(","))); } // else we rely on KafkaProperties which defaults to localhost:9092 + Map customizedProperties = consumerProps.buildProperties(); + customizedProperties.put(ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG, + kafkaConfiguration.getConsumer().getMaxPartitionFetchBytes()); + ConcurrentKafkaListenerContainerFactory factory = new ConcurrentKafkaListenerContainerFactory<>(); factory.setContainerCustomizer(new ThreadPoolContainerCustomizer()); - factory.setConsumerFactory(new DefaultKafkaConsumerFactory<>(properties.buildConsumerProperties())); + factory.setConsumerFactory(new DefaultKafkaConsumerFactory<>(customizedProperties)); log.info("Simple KafkaListenerContainerFactory built successfully"); diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java index 980cafaceae27e..032b934a7ba87b 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java +++ 
b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/OnBootApplicationListener.java
@@ -15,15 +15,18 @@
 import org.apache.http.impl.client.HttpClients;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.beans.factory.annotation.Value;
 import org.springframework.context.event.ContextRefreshedEvent;
 import org.springframework.context.event.EventListener;
 import org.springframework.stereotype.Component;
 import org.springframework.web.context.WebApplicationContext;
+import org.springframework.context.annotation.Configuration;

 /**
  * Responsible for coordinating starting steps that happen before the application starts up.
  */
+@Configuration
 @Slf4j
 @Component
 public class OnBootApplicationListener {
@@ -44,6 +47,8 @@ public class OnBootApplicationListener {
   @Qualifier("configurationProvider")
   private ConfigurationProvider provider;

+  @Value("${bootstrap.servlets.waitTimeout}")
+  private int _servletsWaitTimeout;

   @EventListener(ContextRefreshedEvent.class)
   public void onApplicationEvent(@Nonnull ContextRefreshedEvent event) {
@@ -62,7 +67,7 @@ public void onApplicationEvent(@Nonnull ContextRefreshedEvent event) {
   public Runnable isSchemaRegistryAPIServletReady() {
     return () -> {
       final HttpGet request = new HttpGet(provider.getKafka().getSchemaRegistry().getUrl());
-      int timeouts = 30;
+      int timeouts = _servletsWaitTimeout;
       boolean openAPIServeletReady = false;
       while (!openAPIServeletReady && timeouts > 0) {
         try {
@@ -79,7 +84,7 @@ public Runnable isSchemaRegistryAPIServletReady() {
         timeouts--;
       }
       if (!openAPIServeletReady) {
-        log.error("Failed to bootstrap DataHub, OpenAPI servlet was not ready after 30 seconds");
+        log.error("Failed to bootstrap DataHub, OpenAPI servlet was not ready after {} seconds", _servletsWaitTimeout);
         System.exit(1);
       } else {
         _bootstrapManager.start();
diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/BootstrapManagerFactory.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/BootstrapManagerFactory.java
index c490f000212010..3a761bd12647e6 100644
--- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/BootstrapManagerFactory.java
+++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/factories/BootstrapManagerFactory.java
@@ -31,6 +31,7 @@
 import com.linkedin.metadata.search.EntitySearchService;
 import com.linkedin.metadata.search.SearchService;
 import com.linkedin.metadata.search.transformer.SearchDocumentTransformer;
+
 import java.util.ArrayList;
 import java.util.List;
 import javax.annotation.Nonnull;
@@ -41,6 +42,7 @@
 import org.springframework.context.annotation.Configuration;
 import org.springframework.context.annotation.Import;
 import org.springframework.context.annotation.Scope;
+import org.springframework.core.io.Resource;

 @Configuration
@@ -89,13 +91,16 @@ public class BootstrapManagerFactory {
   @Value("${bootstrap.backfillBrowsePathsV2.enabled}")
   private Boolean _backfillBrowsePathsV2Enabled;

+  @Value("${bootstrap.policies.file}")
+  private Resource _policiesResource;
+
   @Bean(name = "bootstrapManager")
   @Scope("singleton")
   @Nonnull
   protected BootstrapManager createInstance() {
     final IngestRootUserStep ingestRootUserStep = new IngestRootUserStep(_entityService);
     final IngestPoliciesStep ingestPoliciesStep =
-        new IngestPoliciesStep(_entityRegistry, _entityService, _entitySearchService, _searchDocumentTransformer);
+        new
IngestPoliciesStep(_entityRegistry, _entityService, _entitySearchService, _searchDocumentTransformer, _policiesResource); final IngestRolesStep ingestRolesStep = new IngestRolesStep(_entityService, _entityRegistry); final IngestDataPlatformsStep ingestDataPlatformsStep = new IngestDataPlatformsStep(_entityService); final IngestDataPlatformInstancesStep ingestDataPlatformInstancesStep = diff --git a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java index 87dcfd736da401..cf296452144664 100644 --- a/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java +++ b/metadata-service/factories/src/main/java/com/linkedin/metadata/boot/steps/IngestPoliciesStep.java @@ -25,6 +25,7 @@ import com.linkedin.mxe.GenericAspect; import com.linkedin.mxe.MetadataChangeProposal; import com.linkedin.policy.DataHubPolicyInfo; + import java.io.IOException; import java.net.URISyntaxException; import java.util.Collections; @@ -35,7 +36,8 @@ import lombok.RequiredArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.springframework.core.io.ClassPathResource; +import org.springframework.core.io.Resource; + import static com.linkedin.metadata.Constants.*; @@ -52,6 +54,8 @@ public class IngestPoliciesStep implements BootstrapStep { private final EntitySearchService _entitySearchService; private final SearchDocumentTransformer _searchDocumentTransformer; + private final Resource _policiesResource; + @Override public String name() { return "IngestPoliciesStep"; @@ -66,10 +70,10 @@ public void execute() throws IOException, URISyntaxException { .maxStringLength(maxSize).build()); // 0. Execute preflight check to see whether we need to ingest policies - log.info("Ingesting default access policies..."); + log.info("Ingesting default access policies from: {}...", _policiesResource); // 1. Read from the file into JSON. 
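With the hunk below, the step reads whatever Spring `Resource` was injected for `${bootstrap.policies.file}`, so the `classpath:boot/policies.json` default and a `file://` override (per the application.yml comment earlier in this diff) share one code path. A hedged, self-contained sketch of that load; note that `Resource.getFile()` only works when the resource resolves to an actual file on disk, so a classpath resource packed inside a jar would need `getInputStream()` instead:

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;

public class PoliciesLoadSketch {
  public static void main(String[] args) throws Exception {
    // Illustrative stand-in for the injected ${bootstrap.policies.file} resource;
    // the path is the example from the application.yml comment.
    Resource policiesResource =
        new FileSystemResource("/datahub/datahub-gms/resources/custom-policies.json");
    // readTree(InputStream) also covers jar-packed classpath resources.
    JsonNode policiesObj = new ObjectMapper().readTree(policiesResource.getInputStream());
    if (!policiesObj.isArray()) {
      throw new RuntimeException("Policies file must contain a JSON array"); // message illustrative
    }
  }
}
```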
-    final JsonNode policiesObj = mapper.readTree(new ClassPathResource("./boot/policies.json").getFile());
+    final JsonNode policiesObj = mapper.readTree(_policiesResource.getFile());

     if (!policiesObj.isArray()) {
       throw new RuntimeException(
diff --git a/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/delegates/EntityApiDelegateImpl.java b/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/delegates/EntityApiDelegateImpl.java
index ade49c876f1686..207c2284e2673c 100644
--- a/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/delegates/EntityApiDelegateImpl.java
+++ b/metadata-service/openapi-entity-servlet/src/main/java/io/datahubproject/openapi/delegates/EntityApiDelegateImpl.java
@@ -45,8 +45,7 @@
 import io.datahubproject.openapi.util.OpenApiEntitiesUtil;
 import com.datahub.authorization.ConjunctivePrivilegeGroup;
 import com.datahub.authorization.DisjunctivePrivilegeGroup;
-import com.linkedin.metadata.models.EntitySpec;
-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntitySpec;
 import com.linkedin.metadata.authorization.PoliciesConfig;
 import com.google.common.collect.ImmutableList;
 import com.datahub.authorization.AuthUtil;
@@ -377,7 +376,7 @@ public ResponseEntity scroll(@Valid Boolean systemMetadata, @Valid List<String> sort, @Valid SortOrder sortOrder, @Valid String query) {
     Authentication authentication = AuthenticationContext.getAuthentication();
-    EntitySpec entitySpec = OpenApiEntitiesUtil.responseClassToEntitySpec(_entityRegistry, _respClazz);
+    com.linkedin.metadata.models.EntitySpec entitySpec = OpenApiEntitiesUtil.responseClassToEntitySpec(_entityRegistry, _respClazz);
     checkScrollAuthorized(authentication, entitySpec);

     // TODO multi-field sort
@@ -410,12 +409,12 @@ public ResponseEntity scroll(@Valid Boolean systemMetadata, @Valid List<String> sort, @Valid SortOrder sortOrder, @Valid String query) {
-    List<Optional<ResourceSpec>> resourceSpecs = List.of(Optional.of(new ResourceSpec(entitySpec.getName(), "")));
+    List<Optional<EntitySpec>> resourceSpecs = List.of(Optional.of(new EntitySpec(entitySpec.getName(), "")));
     if (_restApiAuthorizationEnabled && !AuthUtil.isAuthorizedForResources(_authorizationChain, actorUrnStr, resourceSpecs, orGroup)) {
       throw new UnauthorizedException(actorUrnStr + " is unauthorized to get entities.");
     }
diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/entities/EntitiesController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/entities/EntitiesController.java
index 6439e2f31f7b00..898f768cf999a9 100644
--- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/entities/EntitiesController.java
+++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/entities/EntitiesController.java
@@ -8,7 +8,7 @@
 import com.datahub.authorization.AuthorizerChain;
 import com.datahub.authorization.ConjunctivePrivilegeGroup;
 import com.datahub.authorization.DisjunctivePrivilegeGroup;
-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntitySpec;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.google.common.collect.ImmutableList;
 import com.linkedin.common.urn.Urn;
@@ -93,8 +93,8 @@ public ResponseEntity getEntities(
           ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE.getType())
       )));
-    List<Optional<ResourceSpec>> resourceSpecs = entityUrns.stream()
-        .map(urn -> Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString())))
+    List<Optional<EntitySpec>> resourceSpecs = entityUrns.stream()
+        .map(urn -> Optional.of(new EntitySpec(urn.getEntityType(), urn.toString())))
         .collect(Collectors.toList());
     if (restApiAuthorizationEnabled && !AuthUtil.isAuthorizedForResources(_authorizerChain, actorUrnStr, resourceSpecs, orGroup)) {
       throw new UnauthorizedException(actorUrnStr + " is unauthorized to get entities.");
@@ -175,8 +175,8 @@ public ResponseEntity<List<RollbackRunResultDto>> deleteEntities(
         .map(URLDecoder::decode)
         .map(UrnUtils::getUrn).collect(Collectors.toSet());
-    List<Optional<ResourceSpec>> resourceSpecs = entityUrns.stream()
-        .map(urn -> Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString())))
+    List<Optional<EntitySpec>> resourceSpecs = entityUrns.stream()
+        .map(urn -> Optional.of(new EntitySpec(urn.getEntityType(), urn.toString())))
         .collect(Collectors.toList());
     if (restApiAuthorizationEnabled && !AuthUtil.isAuthorizedForResources(_authorizerChain, actorUrnStr, resourceSpecs, orGroup)) {
       UnauthorizedException unauthorizedException = new UnauthorizedException(actorUrnStr + " is unauthorized to delete entities.");
diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java
index 1e37170f37b3b8..4641fed3a8610c 100644
--- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java
+++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/relationships/RelationshipsController.java
@@ -8,7 +8,7 @@
 import com.datahub.authorization.AuthorizerChain;
 import com.datahub.authorization.ConjunctivePrivilegeGroup;
 import com.datahub.authorization.DisjunctivePrivilegeGroup;
-import com.datahub.authorization.ResourceSpec;
+import com.datahub.authorization.EntitySpec;
 import com.google.common.collect.ImmutableList;
 import com.linkedin.common.urn.Urn;
 import com.linkedin.common.urn.UrnUtils;
@@ -131,8 +131,8 @@ public ResponseEntity getRelationships(
       // Re-using GET_ENTITY_PRIVILEGE here as it doesn't make sense to split the privileges between these APIs.
diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/timeline/TimelineController.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/timeline/TimelineController.java index 5a0ce2e314e1b7..fbde9e80720026 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/timeline/TimelineController.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/timeline/TimelineController.java @@ -6,7 +6,7 @@ import com.datahub.authorization.AuthorizerChain; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.fasterxml.jackson.core.JsonProcessingException; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; @@ -67,7 +67,7 @@ public ResponseEntity<List<ChangeTransaction>> getTimeline( Urn urn = Urn.createFromString(rawUrn); Authentication authentication = AuthenticationContext.getAuthentication(); String actorUrnStr = authentication.getActor().toUrnStr(); - ResourceSpec resourceSpec = new ResourceSpec(urn.getEntityType(), rawUrn); + EntitySpec resourceSpec = new EntitySpec(urn.getEntityType(), rawUrn); DisjunctivePrivilegeGroup orGroup = new DisjunctivePrivilegeGroup( ImmutableList.of(new ConjunctivePrivilegeGroup(ImmutableList.of(PoliciesConfig.GET_TIMELINE_PRIVILEGE.getType())))); if (restApiAuthorizationEnabled && !AuthUtil.isAuthorized(_authorizerChain, actorUrnStr, Optional.of(resourceSpec), orGroup)) {
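A side effect worth noting before the next file: once com.datahub.authorization.EntitySpec claims the short name, references to the model-registry EntitySpec must be fully qualified. A two-line sketch of the resulting convention (illustrative; entityType stands in for the accessors used in the hunks below):

    import com.datahub.authorization.EntitySpec;          // the authorization spec now owns the short name
    com.linkedin.metadata.models.EntitySpec entitySpec =  // the registry spec is spelled out in full
        entityService.getEntityRegistry().getEntitySpec(entityType);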
diff --git a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java index 2b3e84e2df20f8..21dc5a4c8a0d65 100644 --- a/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java +++ b/metadata-service/openapi-servlet/src/main/java/io/datahubproject/openapi/util/MappingUtil.java @@ -5,7 +5,7 @@ import com.datahub.authorization.AuthUtil; import com.datahub.plugins.auth.authorization.Authorizer; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; @@ -27,7 +27,6 @@ import com.linkedin.metadata.entity.ebean.transactions.AspectsBatchImpl; import com.linkedin.metadata.entity.transactions.AspectsBatch; import com.linkedin.metadata.entity.validation.ValidationException; -import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.entity.AspectUtils; import com.linkedin.metadata.utils.EntityKeyUtils; import com.linkedin.metadata.utils.metrics.MetricUtils; @@ -378,11 +377,11 @@ public static GenericAspect convertGenericAspect(@Nonnull io.datahubproject.open public static boolean authorizeProposals(List<MetadataChangeProposal> proposals, EntityService entityService, Authorizer authorizer, String actorUrnStr, DisjunctivePrivilegeGroup orGroup) { - List<Optional<ResourceSpec>> resourceSpecs = proposals.stream() + List<Optional<EntitySpec>> resourceSpecs = proposals.stream() .map(proposal -> { - EntitySpec entitySpec = entityService.getEntityRegistry().getEntitySpec(proposal.getEntityType()); + com.linkedin.metadata.models.EntitySpec entitySpec = entityService.getEntityRegistry().getEntitySpec(proposal.getEntityType()); Urn entityUrn = EntityKeyUtils.getUrnFromProposal(proposal, entitySpec.getKeyAspectSpec()); - return Optional.of(new ResourceSpec(proposal.getEntityType(), entityUrn.toString())); + return Optional.of(new EntitySpec(proposal.getEntityType(), entityUrn.toString())); }) .collect(Collectors.toList()); return AuthUtil.isAuthorizedForResources(authorizer, actorUrnStr, resourceSpecs, orGroup); @@ -513,7 +512,7 @@ public static RollbackRunResultDto mapRollbackRunResult(RollbackRunResult rollba } public static UpsertAspectRequest createStatusRemoval(Urn urn, EntityService entityService) { - EntitySpec entitySpec = entityService.getEntityRegistry().getEntitySpec(urn.getEntityType()); + com.linkedin.metadata.models.EntitySpec entitySpec = entityService.getEntityRegistry().getEntitySpec(urn.getEntityType()); if (entitySpec == null || !entitySpec.getAspectSpecMap().containsKey(STATUS_ASPECT_NAME)) { throw new IllegalArgumentException("Entity type is not valid for soft deletes: " + urn.getEntityType()); } diff --git a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java index b6bc282f10b65e..442ac1b0d287b3 100644 --- a/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java +++ b/metadata-service/plugin/src/test/sample-test-plugins/src/main/java/com/datahub/plugins/test/TestAuthorizer.java @@ -4,7 +4,7 @@ import com.datahub.authorization.AuthorizationResult; import com.datahub.authorization.AuthorizedActors; import com.datahub.authorization.AuthorizerContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.PluginConstant; import com.datahub.plugins.auth.authorization.Authorizer; import java.io.BufferedReader; @@ -74,7 +74,7 @@ public AuthorizationResult authorize(@Nonnull AuthorizationRequest request) { } @Override - public AuthorizedActors authorizedActors(String privilege, Optional<ResourceSpec> resourceSpec) { + public AuthorizedActors authorizedActors(String privilege, Optional<EntitySpec> resourceSpec) { return new AuthorizedActors("ALL", null, null, true, true); } } diff --git a/metadata-service/restli-servlet-impl/build.gradle b/metadata-service/restli-servlet-impl/build.gradle index cb307863748c31..de6fb6690e693b 100644 --- a/metadata-service/restli-servlet-impl/build.gradle +++ b/metadata-service/restli-servlet-impl/build.gradle @@ -48,7 +48,7 @@ dependencies { implementation externalDependency.dropwizardMetricsCore implementation externalDependency.dropwizardMetricsJmx - compileOnly externalDependency.lombok + implementation externalDependency.lombok implementation externalDependency.neo4jJavaDriver implementation externalDependency.opentelemetryAnnotations diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java
b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java index 936c8bb67e6451..af76af90ce77fc 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/AspectResource.java @@ -3,7 +3,7 @@ import com.codahale.metrics.MetricRegistry; import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; @@ -20,7 +20,6 @@ import com.linkedin.metadata.entity.AspectUtils; import com.linkedin.metadata.entity.EntityService; import com.linkedin.metadata.entity.validation.ValidationException; -import com.linkedin.metadata.models.EntitySpec; import com.linkedin.metadata.query.filter.Filter; import com.linkedin.metadata.query.filter.SortCriterion; import com.linkedin.metadata.restli.RestliUtil; @@ -123,7 +122,7 @@ public Task get(@Nonnull String urnStr, @QueryParam("aspect") @Option Authentication authentication = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get aspect for " + urn); } final VersionedAspect aspect = _entityService.getVersionedAspect(urn, aspectName, version); @@ -154,7 +153,7 @@ public Task getTimeseriesAspectValues( Authentication authentication = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.GET_TIMESERIES_ASPECT_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get timeseries aspect for " + urn); } GetTimeseriesAspectValuesResponse response = new GetTimeseriesAspectValuesResponse(); @@ -193,11 +192,11 @@ public Task ingestProposal( } Authentication authentication = AuthenticationContext.getAuthentication(); - EntitySpec entitySpec = _entityService.getEntityRegistry().getEntitySpec(metadataChangeProposal.getEntityType()); + com.linkedin.metadata.models.EntitySpec entitySpec = _entityService.getEntityRegistry().getEntitySpec(metadataChangeProposal.getEntityType()); Urn urn = EntityKeyUtils.getUrnFromProposal(metadataChangeProposal, entitySpec.getKeyAspectSpec()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to modify entity " + urn); } String actorUrnStr = authentication.getActor().toUrnStr(); @@ -249,7 +248,7 @@ public Task getCount(@ActionParam(PARAM_ASPECT) 
@Nonnull String aspectN Authentication authentication = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), - (ResourceSpec) null)) { + (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get aspect counts."); } return _entityService.getCountAspect(aspectName, urnLike); diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java index 3ff22fb7676760..9bab846d1bdcc8 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/BatchIngestionRunResource.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.AuditStamp; import com.linkedin.common.urn.Urn; @@ -123,9 +123,9 @@ public Task rollback(@ActionParam("runId") @Nonnull String run List aspectRowsToDelete; aspectRowsToDelete = _systemMetadataService.findByRunId(runId, doHardDelete, 0, ESUtils.MAX_RESULT_SIZE); Set urns = aspectRowsToDelete.stream().collect(Collectors.groupingBy(AspectRowSummary::getUrn)).keySet(); - List> resourceSpecs = urns.stream() + List> resourceSpecs = urns.stream() .map(UrnUtils::getUrn) - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java index f6dedfb9a07c61..3ee98b32447180 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityResource.java @@ -3,7 +3,7 @@ import com.codahale.metrics.MetricRegistry; import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.linkedin.common.AuditStamp; @@ -173,7 +173,7 @@ public Task get(@Nonnull String urnStr, final Urn urn = Urn.createFromString(urnStr); Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), new ResourceSpec(urn.getEntityType(), urnStr))) { + && 
!isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), new EntitySpec(urn.getEntityType(), urnStr))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity " + urn); } @@ -198,8 +198,8 @@ public Task> batchGet(@Nonnull Set urnStrs, for (final String urnStr : urnStrs) { urns.add(Urn.createFromString(urnStr)); } - List> resourceSpecs = urns.stream() - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + List> resourceSpecs = urns.stream() + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) @@ -242,7 +242,7 @@ public Task ingest(@ActionParam(PARAM_ENTITY) @Nonnull Entity entity, final Urn urn = com.datahub.util.ModelUtils.getUrnFromSnapshotUnion(entity.getValue()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to edit entity " + urn); } @@ -273,10 +273,10 @@ public Task batchIngest(@ActionParam(PARAM_ENTITIES) @Nonnull Entity[] ent Authentication authentication = AuthenticationContext.getAuthentication(); String actorUrnStr = authentication.getActor().toUrnStr(); - List> resourceSpecs = Arrays.stream(entities) + List> resourceSpecs = Arrays.stream(entities) .map(Entity::getValue) .map(com.datahub.util.ModelUtils::getUrnFromSnapshotUnion) - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), resourceSpecs)) { @@ -322,7 +322,7 @@ public Task search(@ActionParam(PARAM_ENTITY) @Nonnull String enti @Optional @Nullable @ActionParam(PARAM_SEARCH_FLAGS) SearchFlags searchFlags) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -347,7 +347,7 @@ public Task searchAcrossEntities(@ActionParam(PARAM_ENTITIES) @Opt @ActionParam(PARAM_COUNT) int count, @ActionParam(PARAM_SEARCH_FLAGS) @Optional SearchFlags searchFlags) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -391,7 +391,7 @@ public Task 
searchAcrossLineage(@ActionParam(PARAM_URN) @No @Optional @Nullable @ActionParam(PARAM_SEARCH_FLAGS) SearchFlags searchFlags) throws URISyntaxException { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -443,7 +443,7 @@ public Task list(@ActionParam(PARAM_ENTITY) @Nonnull String entityNa Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -462,7 +462,7 @@ public Task autocomplete(@ActionParam(PARAM_ENTITY) @Nonnull Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -479,7 +479,7 @@ public Task browse(@ActionParam(PARAM_ENTITY) @Nonnull String enti Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -497,7 +497,7 @@ public Task getBrowsePaths( Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity: " + urn); } @@ -546,9 +546,9 @@ public Task deleteEntities(@ActionParam("registryId") @Optiona log.info("found {} rows to delete...", stringifyRowCount(aspectRowsToDelete.size())); response.setAspectsAffected(aspectRowsToDelete.size()); Set urns = aspectRowsToDelete.stream().collect(Collectors.groupingBy(AspectRowSummary::getUrn)).keySet(); - List> resourceSpecs = urns.stream() + List> resourceSpecs = urns.stream() .map(UrnUtils::getUrn) - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); Authentication auth = AuthenticationContext.getAuthentication(); if 
(Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) @@ -590,7 +590,7 @@ public Task deleteEntity(@ActionParam(PARAM_URN) @Nonnull Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.DELETE_ENTITY_PRIVILEGE), - Collections.singletonList(java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))))) { + Collections.singletonList(java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to delete entity: " + urnStr); } @@ -638,7 +638,7 @@ private Long deleteTimeseriesAspects(@Nonnull Urn urn, @Nullable Long startTimeM Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.DELETE_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urn.toString()))) { + new EntitySpec(urn.getEntityType(), urn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to delete entity " + urn); } @@ -678,7 +678,7 @@ public Task deleteReferencesTo(@ActionParam(PARAM_URN) Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.DELETE_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urnStr))) { + new EntitySpec(urn.getEntityType(), urnStr))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to delete entity " + urnStr); } @@ -695,7 +695,7 @@ public Task deleteReferencesTo(@ActionParam(PARAM_URN) public Task setWriteable(@ActionParam(PARAM_VALUE) @Optional("true") @Nonnull Boolean value) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SET_WRITEABLE_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SET_WRITEABLE_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to enable and disable write mode."); } @@ -712,7 +712,7 @@ public Task setWriteable(@ActionParam(PARAM_VALUE) @Optional("true") @Nonn public Task getTotalEntityCount(@ActionParam(PARAM_ENTITY) @Nonnull String entityName) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity counts."); } @@ -725,7 +725,7 @@ public Task getTotalEntityCount(@ActionParam(PARAM_ENTITY) @Nonnull String public Task batchGetTotalEntityCount(@ActionParam(PARAM_ENTITIES) @Nonnull String[] entityNames) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, 
ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_COUNTS_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity counts."); } @@ -739,7 +739,7 @@ public Task listUrns(@ActionParam(PARAM_ENTITY) @Nonnull String @ActionParam(PARAM_START) int start, @ActionParam(PARAM_COUNT) int count) throws URISyntaxException { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -757,10 +757,10 @@ public Task applyRetention(@ActionParam(PARAM_START) @Optional @Nullable @ActionParam(PARAM_URN) @Optional @Nullable String urn ) { Authentication auth = AuthenticationContext.getAuthentication(); - ResourceSpec resourceSpec = null; + EntitySpec resourceSpec = null; if (StringUtils.isNotBlank(urn)) { Urn resource = UrnUtils.getUrn(urn); - resourceSpec = new ResourceSpec(resource.getEntityType(), resource.toString()); + resourceSpec = new EntitySpec(resource.getEntityType(), resource.toString()); } if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.APPLY_RETENTION_PRIVILEGE), resourceSpec)) { @@ -781,7 +781,7 @@ public Task filter(@ActionParam(PARAM_ENTITY) @Nonnull String enti Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.SEARCH_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to search."); } @@ -799,7 +799,7 @@ public Task exists(@ActionParam(PARAM_URN) @Nonnull String urnStr) thro Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - new ResourceSpec(urn.getEntityType(), urnStr))) { + new EntitySpec(urn.getEntityType(), urnStr))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized get entity: " + urnStr); } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java index 7efb93c0f50e63..0c3e93273b863a 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityV2Resource.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import 
com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; import com.linkedin.entity.EntityResponse; @@ -68,7 +68,7 @@ public Task get(@Nonnull String urnStr, final Urn urn = Urn.createFromString(urnStr); Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), new ResourceSpec(urn.getEntityType(), urnStr))) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), new EntitySpec(urn.getEntityType(), urnStr))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity " + urn); } @@ -96,8 +96,8 @@ public Task> batchGet(@Nonnull Set urnStrs, urns.add(Urn.createFromString(urnStr)); } Authentication auth = AuthenticationContext.getAuthentication(); - List> resourceSpecs = urns.stream() - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + List> resourceSpecs = urns.stream() + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), resourceSpecs)) { diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java index fd5c3507b54089..05b7e6b3ff24bd 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/entity/EntityVersionedV2Resource.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.VersionedUrn; import com.linkedin.common.urn.Urn; @@ -65,9 +65,9 @@ public Task> batchGetVersioned( @QueryParam(PARAM_ENTITY_TYPE) @Nonnull String entityType, @QueryParam(PARAM_ASPECTS) @Optional @Nullable String[] aspectNames) { Authentication auth = AuthenticationContext.getAuthentication(); - List> resourceSpecs = versionedUrnStrs.stream() + List> resourceSpecs = versionedUrnStrs.stream() .map(versionedUrn -> UrnUtils.getUrn(versionedUrn.getUrn())) - .map(urn -> java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))) + .map(urn -> java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))) .collect(Collectors.toList()); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), resourceSpecs)) { diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java index 313d16333f9e96..4a8e74c89039af 100644 --- 
a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/lineage/Relationships.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.common.EntityRelationship; import com.linkedin.common.EntityRelationshipArray; @@ -107,7 +107,7 @@ public Task get(@QueryParam("urn") @Nonnull String rawUrn, Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - Collections.singletonList(java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))))) { + Collections.singletonList(java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity lineage: " + rawUrn); } @@ -142,7 +142,7 @@ public UpdateResponse delete(@QueryParam("urn") @Nonnull String rawUrn) throws E Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.DELETE_ENTITY_PRIVILEGE), - Collections.singletonList(java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))))) { + Collections.singletonList(java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to delete entity: " + rawUrn); } @@ -162,7 +162,7 @@ public Task getLineage(@ActionParam(PARAM_URN) @Nonnull Str Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE), - Collections.singletonList(java.util.Optional.of(new ResourceSpec(urn.getEntityType(), urn.toString()))))) { + Collections.singletonList(java.util.Optional.of(new EntitySpec(urn.getEntityType(), urn.toString()))))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to get entity lineage: " + urnStr); } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java index 188e5ae18ee8f5..12586b66495a92 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/operations/Utils.java @@ -2,7 +2,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.linkedin.common.urn.Urn; @@ -37,10 +37,10 @@ public static String restoreIndices( @Nonnull 
EntityService entityService ) { Authentication authentication = AuthenticationContext.getAuthentication(); - ResourceSpec resourceSpec = null; + EntitySpec resourceSpec = null; if (StringUtils.isNotBlank(urn)) { Urn resource = UrnUtils.getUrn(urn); - resourceSpec = new ResourceSpec(resource.getEntityType(), resource.toString()); + resourceSpec = new EntitySpec(resource.getEntityType(), resource.toString()); } if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(authentication, authorizer, ImmutableList.of(PoliciesConfig.RESTORE_INDICES_PRIVILEGE), diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java index f36841bb4abaea..a8018074497c44 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/platform/PlatformResource.java @@ -3,7 +3,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.google.common.collect.ImmutableList; import com.linkedin.entity.Entity; import com.linkedin.metadata.authorization.PoliciesConfig; @@ -54,7 +54,7 @@ public Task<Void> producePlatformEvent( @ActionParam("event") @Nonnull PlatformEvent event) { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.PRODUCE_PLATFORM_EVENT_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.PRODUCE_PLATFORM_EVENT_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to produce platform events."); }
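RestliUtils (next file) keeps two isAuthorized overloads, matching the two call shapes used throughout this patch. A hedged sketch of how call sites choose between them (auth, authorizer, urn, and resourceSpecs are stand-ins taken from the surrounding hunks):

    // Single-resource check: one EntitySpec, or a null EntitySpec for a global privilege.
    boolean single = isAuthorized(auth, authorizer,
        ImmutableList.of(PoliciesConfig.GET_ENTITY_PRIVILEGE),
        new EntitySpec(urn.getEntityType(), urn.toString()));
    // Batch check: a List<Optional<EntitySpec>>, one entry per entity being touched.
    boolean batch = isAuthorized(auth, authorizer,
        ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), resourceSpecs);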
diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java index 5c3b90a84aec1c..9949556c99b81d 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/restli/RestliUtils.java @@ -4,7 +4,7 @@ import com.datahub.authorization.AuthUtil; import com.datahub.authorization.ConjunctivePrivilegeGroup; import com.datahub.authorization.DisjunctivePrivilegeGroup; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.datahub.plugins.auth.authorization.Authorizer; import com.google.common.collect.ImmutableList; import com.linkedin.metadata.authorization.PoliciesConfig; @@ -82,13 +82,13 @@ public static RestLiServiceException invalidArgumentsException(@Nullable String } public static boolean isAuthorized(@Nonnull Authentication authentication, @Nonnull Authorizer authorizer, - @Nonnull final List<PoliciesConfig.Privilege> privileges, @Nonnull final List<Optional<ResourceSpec>> resources) { + @Nonnull final List<PoliciesConfig.Privilege> privileges, @Nonnull final List<Optional<EntitySpec>> resources) { DisjunctivePrivilegeGroup orGroup = convertPrivilegeGroup(privileges); return AuthUtil.isAuthorizedForResources(authorizer, authentication.getActor().toUrnStr(), resources, orGroup); } public static boolean isAuthorized(@Nonnull Authentication authentication, @Nonnull Authorizer authorizer, - @Nonnull final List<PoliciesConfig.Privilege> privileges, @Nullable final ResourceSpec resource) { + @Nonnull final List<PoliciesConfig.Privilege> privileges, @Nullable final EntitySpec resource) { DisjunctivePrivilegeGroup orGroup = convertPrivilegeGroup(privileges); return AuthUtil.isAuthorized(authorizer, authentication.getActor().toUrnStr(), java.util.Optional.ofNullable(resource), orGroup); } diff --git a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java index be70cf9c494eff..02d413301f3b4e 100644 --- a/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java +++ b/metadata-service/restli-servlet-impl/src/main/java/com/linkedin/metadata/resources/usage/UsageStats.java @@ -4,7 +4,7 @@ import com.datahub.authentication.Authentication; import com.datahub.authentication.AuthenticationContext; import com.datahub.plugins.auth.authorization.Authorizer; -import com.datahub.authorization.ResourceSpec; +import com.datahub.authorization.EntitySpec; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.StreamReadConstraints; import com.fasterxml.jackson.databind.JsonNode; @@ -125,7 +125,7 @@ public Task batchIngest(@ActionParam(PARAM_BUCKETS) @Nonnull UsageAggregat return RestliUtil.toTask(() -> { Authentication auth = AuthenticationContext.getAuthentication(); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) - && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), (ResourceSpec) null)) { + && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.EDIT_ENTITY_PRIVILEGE), (EntitySpec) null)) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to edit entities."); } @@ -323,7 +323,7 @@ public Task query(@ActionParam(PARAM_RESOURCE) @Nonnull String Urn resourceUrn = UrnUtils.getUrn(resource); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE), - new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString()))) { + new EntitySpec(resourceUrn.getEntityType(), resourceUrn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to query usage."); } @@ -383,7 +383,7 @@ public Task queryRange(@ActionParam(PARAM_RESOURCE) @Nonnull S Urn resourceUrn = UrnUtils.getUrn(resource); if (Boolean.parseBoolean(System.getenv(REST_API_AUTHORIZATION_ENABLED_ENV)) && !isAuthorized(auth, _authorizer, ImmutableList.of(PoliciesConfig.VIEW_DATASET_USAGE_PRIVILEGE), - new ResourceSpec(resourceUrn.getEntityType(), resourceUrn.toString()))) { + new EntitySpec(resourceUrn.getEntityType(), resourceUrn.toString()))) { throw new RestLiServiceException(HttpStatus.S_401_UNAUTHORIZED, "User is unauthorized to query usage."); } diff --git a/metadata-service/services/build.gradle b/metadata-service/services/build.gradle index 22c62af324c12d..b6af3d330d185b 100644 --- a/metadata-service/services/build.gradle +++ b/metadata-service/services/build.gradle @@ -9,9 +9,9 @@ dependencies { implementation externalDependency.jsonPatch implementation
project(':entity-registry') implementation project(':metadata-utils') - implementation project(':metadata-events:mxe-avro-1.7') + implementation project(':metadata-events:mxe-avro') implementation project(':metadata-events:mxe-registration') - implementation project(':metadata-events:mxe-utils-avro-1.7') + implementation project(':metadata-events:mxe-utils-avro') implementation project(':metadata-models') implementation project(':metadata-service:restli-client') implementation project(':metadata-service:configuration') diff --git a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java index a46b58aabfb0b2..64f59780b887f3 100644 --- a/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java +++ b/metadata-service/services/src/main/java/com/linkedin/metadata/search/EntitySearchService.java @@ -188,11 +188,12 @@ BrowseResult browse(@Nonnull String entityName, @Nonnull String path, @Nullable * @param sortCriterion {@link SortCriterion} to be applied to search results * @param scrollId opaque scroll identifier to pass to search service * @param size the number of search hits to return + * @param searchFlags flags controlling search options * @return a {@link ScrollResult} that contains a list of matched documents and related search result metadata */ @Nonnull ScrollResult fullTextScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size); + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size, @Nullable SearchFlags searchFlags); /** * Gets a list of documents that match given search request. 
The results are aggregated and filters are applied to the @@ -204,11 +205,12 @@ ScrollResult fullTextScroll(@Nonnull List entities, @Nonnull String inpu * @param sortCriterion {@link SortCriterion} to be applied to search results * @param scrollId opaque scroll identifier to pass to search service * @param size the number of search hits to return + * @param searchFlags flags controlling search options * @return a {@link ScrollResult} that contains a list of matched documents and related search result metadata */ @Nonnull ScrollResult structuredScroll(@Nonnull List entities, @Nonnull String input, @Nullable Filter postFilters, - @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size); + @Nullable SortCriterion sortCriterion, @Nullable String scrollId, @Nonnull String keepAlive, int size, @Nullable SearchFlags searchFlags); /** * Max result size returned by the underlying search backend diff --git a/metadata-service/war/src/main/resources/boot/policies.json b/metadata-service/war/src/main/resources/boot/policies.json index 410596cc30cbed..b7ffc11c08f055 100644 --- a/metadata-service/war/src/main/resources/boot/policies.json +++ b/metadata-service/war/src/main/resources/boot/policies.json @@ -56,7 +56,7 @@ "EDIT_ENTITY", "VIEW_ENTITY_PAGE", "EDIT_LINEAGE", - "EDIT_ENTITY_ASSERTIONS_PRIVILEGE", + "EDIT_ENTITY_ASSERTIONS", "SEARCH_PRIVILEGE", "GET_COUNTS_PRIVILEGE", "GET_TIMESERIES_ASPECT_PRIVILEGE", @@ -64,7 +64,8 @@ "GET_TIMELINE_PRIVILEGE", "PRODUCE_PLATFORM_EVENT_PRIVILEGE", "MANAGE_DATA_PRODUCTS", - "MANAGE_GLOBAL_OWNERSHIP_TYPES" + "MANAGE_GLOBAL_OWNERSHIP_TYPES", + "DELETE_ENTITY" ], "displayName":"Root User - Edit and View All Resources", "description":"Grants full edit and view privileges for all resources to root 'datahub' root user.", @@ -250,11 +251,6 @@ "EDIT_GROUP_MEMBERS", "EDIT_USER_PROFILE", "EDIT_CONTACT_INFO", - "MANAGE_ENTITY_TAGS_PRIVILEGE", - "MANAGE_ENTITY_GLOSSARY_TERMS_PRIVILEGE", - "MANAGE_DATASET_COL_GLOSSARY_TERMS_PRIVILEGE", - "MANAGE_DATASET_COL_TAGS_PRIVILEGE", - "EDIT_ENTITY_ASSERTIONS_PRIVILEGE", "EDIT_LINEAGE", "EDIT_ENTITY_QUERIES", "SEARCH_PRIVILEGE", @@ -263,7 +259,8 @@ "GET_ENTITY_PRIVILEGE", "GET_TIMELINE_PRIVILEGE", "PRODUCE_PLATFORM_EVENT_PRIVILEGE", - "MANAGE_DATA_PRODUCTS" + "MANAGE_DATA_PRODUCTS", + "DELETE_ENTITY" ], "displayName":"Admins - Metadata Policy", "description":"Admins have all metadata privileges.", @@ -334,11 +331,6 @@ "EDIT_GROUP_MEMBERS", "EDIT_USER_PROFILE", "EDIT_CONTACT_INFO", - "MANAGE_ENTITY_TAGS_PRIVILEGE", - "MANAGE_ENTITY_GLOSSARY_TERMS_PRIVILEGE", - "MANAGE_DATASET_COL_GLOSSARY_TERMS_PRIVILEGE", - "MANAGE_DATASET_COL_TAGS_PRIVILEGE", - "EDIT_ENTITY_ASSERTIONS_PRIVILEGE", "EDIT_LINEAGE", "EDIT_ENTITY_QUERIES", "SEARCH_PRIVILEGE", @@ -439,11 +431,6 @@ "EDIT_GROUP_MEMBERS", "EDIT_USER_PROFILE", "EDIT_CONTACT_INFO", - "MANAGE_ENTITY_TAGS_PRIVILEGE", - "MANAGE_ENTITY_GLOSSARY_TERMS_PRIVILEGE", - "MANAGE_DATASET_COL_GLOSSARY_TERMS_PRIVILEGE", - "MANAGE_DATASET_COL_TAGS_PRIVILEGE", - "EDIT_ENTITY_ASSERTIONS_PRIVILEGE", "EDIT_LINEAGE", "EDIT_ENTITY_QUERIES", "GET_TIMELINE_PRIVILEGE", diff --git a/metadata-utils/build.gradle b/metadata-utils/build.gradle index 1c1c368611488f..7bc6aa2d434424 100644 --- a/metadata-utils/build.gradle +++ b/metadata-utils/build.gradle @@ -1,7 +1,7 @@ apply plugin: 'java-library' dependencies { - api externalDependency.avro_1_7 + api externalDependency.avro implementation externalDependency.commonsLang api externalDependency.dropwizardMetricsCore implementation 
externalDependency.dropwizardMetricsJmx @@ -16,8 +16,8 @@ dependencies { api project(':li-utils') api project(':entity-registry') - api project(':metadata-events:mxe-avro-1.7') - api project(':metadata-events:mxe-utils-avro-1.7') + api project(':metadata-events:mxe-avro') + api project(':metadata-events:mxe-utils-avro') implementation externalDependency.slf4jApi compileOnly externalDependency.lombok diff --git a/settings.gradle b/settings.gradle index d6777b07b3fb3c..52de461383b5e8 100644 --- a/settings.gradle +++ b/settings.gradle @@ -20,10 +20,10 @@ include 'metadata-service:openapi-analytics-servlet' include 'metadata-service:plugin' include 'metadata-service:plugin:src:test:sample-test-plugins' include 'metadata-dao-impl:kafka-producer' -include 'metadata-events:mxe-avro-1.7' +include 'metadata-events:mxe-avro' include 'metadata-events:mxe-registration' include 'metadata-events:mxe-schemas' -include 'metadata-events:mxe-utils-avro-1.7' +include 'metadata-events:mxe-utils-avro' include 'metadata-ingestion' include 'metadata-jobs:mae-consumer' include 'metadata-jobs:mce-consumer' diff --git a/smoke-test/tests/assertions/assertions_test.py b/smoke-test/tests/assertions/assertions_test.py index 4aa64c512f6841..48f3564e6cd971 100644 --- a/smoke-test/tests/assertions/assertions_test.py +++ b/smoke-test/tests/assertions/assertions_test.py @@ -2,28 +2,29 @@ import urllib import pytest -import requests_wrapper as requests import tenacity from datahub.emitter.mce_builder import make_dataset_urn, make_schema_field_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext, RecordEnvelope from datahub.ingestion.api.sink import NoopWriteCallback from datahub.ingestion.sink.file import FileSink, FileSinkConfig -from datahub.metadata.com.linkedin.pegasus2avro.assertion import AssertionStdAggregation -from datahub.metadata.schema_classes import ( - AssertionInfoClass, - AssertionResultClass, - AssertionResultTypeClass, - AssertionRunEventClass, - AssertionRunStatusClass, - AssertionStdOperatorClass, - AssertionTypeClass, - DatasetAssertionInfoClass, - DatasetAssertionScopeClass, - PartitionSpecClass, - PartitionTypeClass, -) -from tests.utils import delete_urns_from_file, get_gms_url, ingest_file_via_rest, wait_for_healthcheck_util, get_sleep_info +from datahub.metadata.com.linkedin.pegasus2avro.assertion import \ + AssertionStdAggregation +from datahub.metadata.schema_classes import (AssertionInfoClass, + AssertionResultClass, + AssertionResultTypeClass, + AssertionRunEventClass, + AssertionRunStatusClass, + AssertionStdOperatorClass, + AssertionTypeClass, + DatasetAssertionInfoClass, + DatasetAssertionScopeClass, + PartitionSpecClass, + PartitionTypeClass) + +import requests_wrapper as requests +from tests.utils import (delete_urns_from_file, get_gms_url, get_sleep_info, + ingest_file_via_rest, wait_for_healthcheck_util) restli_default_headers = { "X-RestLi-Protocol-Version": "2.0.0", diff --git a/smoke-test/tests/browse/browse_test.py b/smoke-test/tests/browse/browse_test.py index b9d2143d13ec70..550f0062d5a398 100644 --- a/smoke-test/tests/browse/browse_test.py +++ b/smoke-test/tests/browse/browse_test.py @@ -1,9 +1,10 @@ import time import pytest -import requests_wrapper as requests -from tests.utils import delete_urns_from_file, get_frontend_url, ingest_file_via_rest +import requests_wrapper as requests +from tests.utils import (delete_urns_from_file, get_frontend_url, + ingest_file_via_rest) TEST_DATASET_1_URN = 
"urn:li:dataset:(urn:li:dataPlatform:kafka,test-browse-1,PROD)" TEST_DATASET_2_URN = "urn:li:dataset:(urn:li:dataPlatform:kafka,test-browse-2,PROD)" @@ -51,7 +52,9 @@ def test_get_browse_paths(frontend_session, ingest_cleanup_data): # /prod -- There should be one entity get_browse_paths_json = { "query": get_browse_paths_query, - "variables": {"input": { "type": "DATASET", "path": ["prod"], "start": 0, "count": 100 } }, + "variables": { + "input": {"type": "DATASET", "path": ["prod"], "start": 0, "count": 100} + }, } response = frontend_session.post( @@ -67,12 +70,19 @@ def test_get_browse_paths(frontend_session, ingest_cleanup_data): browse = res_data["data"]["browse"] print(browse) - assert browse["entities"] == [{ "urn": TEST_DATASET_3_URN }] + assert browse["entities"] == [{"urn": TEST_DATASET_3_URN}] # /prod/kafka1 get_browse_paths_json = { "query": get_browse_paths_query, - "variables": {"input": { "type": "DATASET", "path": ["prod", "kafka1"], "start": 0, "count": 10 } }, + "variables": { + "input": { + "type": "DATASET", + "path": ["prod", "kafka1"], + "start": 0, + "count": 10, + } + }, } response = frontend_session.post( @@ -88,16 +98,27 @@ def test_get_browse_paths(frontend_session, ingest_cleanup_data): browse = res_data["data"]["browse"] assert browse == { - "total": 3, - "entities": [{ "urn": TEST_DATASET_1_URN }, { "urn": TEST_DATASET_2_URN }, { "urn": TEST_DATASET_3_URN }], - "groups": [], - "metadata": { "path": ["prod", "kafka1"], "totalNumEntities": 0 } + "total": 3, + "entities": [ + {"urn": TEST_DATASET_1_URN}, + {"urn": TEST_DATASET_2_URN}, + {"urn": TEST_DATASET_3_URN}, + ], + "groups": [], + "metadata": {"path": ["prod", "kafka1"], "totalNumEntities": 0}, } # /prod/kafka2 get_browse_paths_json = { "query": get_browse_paths_query, - "variables": {"input": { "type": "DATASET", "path": ["prod", "kafka2"], "start": 0, "count": 10 } }, + "variables": { + "input": { + "type": "DATASET", + "path": ["prod", "kafka2"], + "start": 0, + "count": 10, + } + }, } response = frontend_session.post( @@ -113,10 +134,8 @@ def test_get_browse_paths(frontend_session, ingest_cleanup_data): browse = res_data["data"]["browse"] assert browse == { - "total": 2, - "entities": [{ "urn": TEST_DATASET_1_URN }, { "urn": TEST_DATASET_2_URN }], - "groups": [], - "metadata": { "path": ["prod", "kafka2"], "totalNumEntities": 0 } + "total": 2, + "entities": [{"urn": TEST_DATASET_1_URN}, {"urn": TEST_DATASET_2_URN}], + "groups": [], + "metadata": {"path": ["prod", "kafka2"], "totalNumEntities": 0}, } - - diff --git a/smoke-test/tests/cli/datahub-cli.py b/smoke-test/tests/cli/datahub-cli.py index 1d0080bdd9d48a..c3db6028efceb8 100644 --- a/smoke-test/tests/cli/datahub-cli.py +++ b/smoke-test/tests/cli/datahub-cli.py @@ -1,8 +1,11 @@ import json -import pytest from time import sleep -from datahub.cli.cli_utils import guess_entity_type, post_entity, get_aspects_for_entity + +import pytest +from datahub.cli.cli_utils import (get_aspects_for_entity, guess_entity_type, + post_entity) from datahub.cli.ingest_cli import get_session_and_host, rollback + from tests.utils import ingest_file_via_rest, wait_for_writes_to_sync ingested_dataset_run_id = "" @@ -24,24 +27,46 @@ def test_setup(): session, gms_host = get_session_and_host() - assert "browsePaths" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) - assert "editableDatasetProperties" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False) + assert "browsePaths" not 
in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["browsePaths"], typed=False + ) + assert "editableDatasetProperties" not in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False + ) - ingested_dataset_run_id = ingest_file_via_rest("tests/cli/cli_test_data.json").config.run_id + ingested_dataset_run_id = ingest_file_via_rest( + "tests/cli/cli_test_data.json" + ).config.run_id print("Setup ingestion id: " + ingested_dataset_run_id) - assert "browsePaths" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) + assert "browsePaths" in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["browsePaths"], typed=False + ) yield # Clean up rollback_url = f"{gms_host}/runs?action=rollback" - session.post(rollback_url, data=json.dumps({"runId": ingested_editable_run_id, "dryRun": False, "hardDelete": True})) - session.post(rollback_url, data=json.dumps({"runId": ingested_dataset_run_id, "dryRun": False, "hardDelete": True})) + session.post( + rollback_url, + data=json.dumps( + {"runId": ingested_editable_run_id, "dryRun": False, "hardDelete": True} + ), + ) + session.post( + rollback_url, + data=json.dumps( + {"runId": ingested_dataset_run_id, "dryRun": False, "hardDelete": True} + ), + ) - assert "browsePaths" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) - assert "editableDatasetProperties" not in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False) + assert "browsePaths" not in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["browsePaths"], typed=False + ) + assert "editableDatasetProperties" not in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False + ) @pytest.mark.dependency() @@ -49,9 +74,7 @@ def test_rollback_editable(): global ingested_dataset_run_id global ingested_editable_run_id platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-rollback" - ) + dataset_name = "test-rollback" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -59,23 +82,38 @@ def test_rollback_editable(): print("Ingested dataset id:", ingested_dataset_run_id) # Assert that second data ingestion worked - assert "browsePaths" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["browsePaths"], typed=False) + assert "browsePaths" in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["browsePaths"], typed=False + ) # Make editable change - ingested_editable_run_id = ingest_file_via_rest("tests/cli/cli_editable_test_data.json").config.run_id + ingested_editable_run_id = ingest_file_via_rest( + "tests/cli/cli_editable_test_data.json" + ).config.run_id print("ingested editable id:", ingested_editable_run_id) # Assert that second data ingestion worked - assert "editableDatasetProperties" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False) + assert "editableDatasetProperties" in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False + ) # rollback ingestion 1 rollback_url = f"{gms_host}/runs?action=rollback" - session.post(rollback_url, data=json.dumps({"runId": ingested_dataset_run_id, "dryRun": False, "hardDelete": False})) + session.post( + rollback_url, + data=json.dumps( + {"runId": ingested_dataset_run_id, "dryRun": False, "hardDelete": False} + ), + ) # Allow async MCP processor to handle ingestions & rollbacks 
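# (A note on the helper, based on tests/consistency_utils.py shown later in this
# patch: wait_for_writes_to_sync polls consumer lag until it drains, giving up
# with a warning after max_timeout_in_sec, 120 seconds by default.)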
wait_for_writes_to_sync() # EditableDatasetProperties should still be part of the entity that was soft deleted. - assert "editableDatasetProperties" in get_aspects_for_entity(entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False) + assert "editableDatasetProperties" in get_aspects_for_entity( + entity_urn=dataset_urn, aspects=["editableDatasetProperties"], typed=False + ) # But first ingestion aspects should not be present - assert "browsePaths" not in get_aspects_for_entity(entity_urn=dataset_urn, typed=False) + assert "browsePaths" not in get_aspects_for_entity( + entity_urn=dataset_urn, typed=False + ) diff --git a/smoke-test/tests/cli/datahub_graph_test.py b/smoke-test/tests/cli/datahub_graph_test.py index 16925d26f6983a..17c8924fb0998b 100644 --- a/smoke-test/tests/cli/datahub_graph_test.py +++ b/smoke-test/tests/cli/datahub_graph_test.py @@ -1,13 +1,11 @@ import pytest import tenacity from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph -from datahub.metadata.schema_classes import KafkaSchemaClass, SchemaMetadataClass -from tests.utils import ( - delete_urns_from_file, - get_gms_url, - get_sleep_info, - ingest_file_via_rest, -) +from datahub.metadata.schema_classes import (KafkaSchemaClass, + SchemaMetadataClass) + +from tests.utils import (delete_urns_from_file, get_gms_url, get_sleep_info, + ingest_file_via_rest) sleep_sec, sleep_times = get_sleep_info() diff --git a/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py b/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py index 4288a61b7a0c16..106da7cd8d71e5 100644 --- a/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py +++ b/smoke-test/tests/cli/delete_cmd/test_timeseries_delete.py @@ -1,21 +1,22 @@ import json import logging +import sys import tempfile import time -import sys from json import JSONDecodeError from typing import Any, Dict, List, Optional -from click.testing import CliRunner, Result - import datahub.emitter.mce_builder as builder +from click.testing import CliRunner, Result from datahub.emitter.serialization_helper import pre_json_transform from datahub.entrypoints import datahub from datahub.metadata.schema_classes import DatasetProfileClass + +import requests_wrapper as requests from tests.aspect_generators.timeseries.dataset_profile_gen import \ gen_dataset_profiles -from tests.utils import get_strftime_from_timestamp_millis, wait_for_writes_to_sync -import requests_wrapper as requests +from tests.utils import (get_strftime_from_timestamp_millis, + wait_for_writes_to_sync) logger = logging.getLogger(__name__) @@ -33,6 +34,7 @@ def sync_elastic() -> None: wait_for_writes_to_sync() + def datahub_put_profile(dataset_profile: DatasetProfileClass) -> None: with tempfile.NamedTemporaryFile("w+t", suffix=".json") as aspect_file: aspect_text: str = json.dumps(pre_json_transform(dataset_profile.to_obj())) diff --git a/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py b/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py index 61e7a5a65b494a..e962b1a5cafd6a 100644 --- a/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py +++ b/smoke-test/tests/cli/ingest_cmd/test_timeseries_rollback.py @@ -2,14 +2,14 @@ import time from typing import Any, Dict, List, Optional -from click.testing import CliRunner, Result - import datahub.emitter.mce_builder as builder +from click.testing import CliRunner, Result from datahub.emitter.serialization_helper import post_json_transform from datahub.entrypoints import datahub from 
datahub.metadata.schema_classes import DatasetProfileClass -from tests.utils import ingest_file_via_rest, wait_for_writes_to_sync + import requests_wrapper as requests +from tests.utils import ingest_file_via_rest, wait_for_writes_to_sync runner = CliRunner(mix_stderr=False) diff --git a/smoke-test/tests/cli/user_groups_cmd/test_group_cmd.py b/smoke-test/tests/cli/user_groups_cmd/test_group_cmd.py index 405e061c016f97..7b986d3be0444d 100644 --- a/smoke-test/tests/cli/user_groups_cmd/test_group_cmd.py +++ b/smoke-test/tests/cli/user_groups_cmd/test_group_cmd.py @@ -1,6 +1,7 @@ import json import sys import tempfile +import time from typing import Any, Dict, Iterable, List import yaml @@ -8,7 +9,7 @@ from datahub.api.entities.corpgroup.corpgroup import CorpGroup from datahub.entrypoints import datahub from datahub.ingestion.graph.client import DataHubGraph, get_default_graph -import time + import requests_wrapper as requests from tests.utils import wait_for_writes_to_sync diff --git a/smoke-test/tests/conftest.py b/smoke-test/tests/conftest.py index eed7a983197eff..57b92a2db1c195 100644 --- a/smoke-test/tests/conftest.py +++ b/smoke-test/tests/conftest.py @@ -2,8 +2,8 @@ import pytest -from tests.utils import wait_for_healthcheck_util, get_frontend_session from tests.test_result_msg import send_message +from tests.utils import get_frontend_session, wait_for_healthcheck_util # Disable telemetry os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" @@ -28,5 +28,5 @@ def test_healthchecks(wait_for_healthchecks): def pytest_sessionfinish(session, exitstatus): - """ whole test run finishes. """ + """whole test run finishes.""" send_message(exitstatus) diff --git a/smoke-test/tests/consistency_utils.py b/smoke-test/tests/consistency_utils.py index 15993733c592b7..607835bf3649c0 100644 --- a/smoke-test/tests/consistency_utils.py +++ b/smoke-test/tests/consistency_utils.py @@ -1,10 +1,16 @@ -import time +import logging import os import subprocess +import time _ELASTIC_BUFFER_WRITES_TIME_IN_SEC: int = 1 USE_STATIC_SLEEP: bool = bool(os.getenv("USE_STATIC_SLEEP", False)) -ELASTICSEARCH_REFRESH_INTERVAL_SECONDS: int = int(os.getenv("ELASTICSEARCH_REFRESH_INTERVAL_SECONDS", 5)) +ELASTICSEARCH_REFRESH_INTERVAL_SECONDS: int = int( + os.getenv("ELASTICSEARCH_REFRESH_INTERVAL_SECONDS", 5) +) + +logger = logging.getLogger(__name__) + def wait_for_writes_to_sync(max_timeout_in_sec: int = 120) -> None: if USE_STATIC_SLEEP: @@ -30,7 +36,9 @@ def wait_for_writes_to_sync(max_timeout_in_sec: int = 120) -> None: lag_zero = True if not lag_zero: - logger.warning(f"Exiting early from waiting for elastic to catch up due to a timeout. Current lag is {lag_values}") + logger.warning( + f"Exiting early from waiting for elastic to catch up due to a timeout. 
Current lag is {lag_values}" + ) else: # we want to sleep for an additional period of time for Elastic writes buffer to clear - time.sleep(_ELASTIC_BUFFER_WRITES_TIME_IN_SEC) \ No newline at end of file + time.sleep(_ELASTIC_BUFFER_WRITES_TIME_IN_SEC) diff --git a/smoke-test/tests/containers/containers_test.py b/smoke-test/tests/containers/containers_test.py index 575e3def6cf232..227645a87d30ad 100644 --- a/smoke-test/tests/containers/containers_test.py +++ b/smoke-test/tests/containers/containers_test.py @@ -1,5 +1,7 @@ import pytest -from tests.utils import delete_urns_from_file, get_frontend_url, ingest_file_via_rest + +from tests.utils import (delete_urns_from_file, get_frontend_url, + ingest_file_via_rest) @@ -225,6 +227,7 @@ def test_update_container(frontend_session, ingest_cleanup_data): "ownerUrn": new_owner, "resourceUrn": container_urn, "ownerEntityType": "CORP_USER", + "ownershipTypeUrn": "urn:li:ownershipType:__system__technical_owner" } }, } diff --git a/smoke-test/tests/cypress/cypress/e2e/domains/nested_domains.js b/smoke-test/tests/cypress/cypress/e2e/domains/nested_domains.js new file mode 100644 index 00000000000000..a2d4de0f51659e --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/domains/nested_domains.js @@ -0,0 +1,53 @@ +const domainName = "CypressNestedDomain"; +const domainDescription = "CypressNestedDomainDescription"; + +describe("nested domains test", () => { + + it("create a domain, move under parent, remove domain", () => { + // Create a new domain without a parent + cy.loginWithCredentials(); + cy.goToDomainList(); + cy.clickOptionWithTestId("domains-new-domain-button"); + cy.get('[data-testid="create-domain-name"]').click().type(domainName); + cy.get('[data-testid="create-domain-description"]').click().type(domainDescription); + cy.clickOptionWithTestId("create-domain-button"); + cy.waitTextVisible(domainName); + + // Ensure the new domain has no parent in the navigation sidebar + cy.waitTextVisible(domainDescription); + + // Move a domain from the root level to be under a parent domain + cy.clickOptionWithText(domainName); + cy.openThreeDotDropdown(); + cy.clickOptionWithTestId("entity-menu-move-button"); + cy.get('[data-testid="move-domain-modal"]').contains("Marketing").click({force: true}); + cy.get('[data-testid="move-domain-modal"]').contains("Marketing").should("be.visible"); + cy.clickOptionWithTestId("move-domain-modal-move-button").wait(5000); + + // Ensure domain is no longer on the sidebar navigator at the top level but shows up under the parent + cy.goToDomainList(); + cy.ensureTextNotPresent(domainName); + cy.ensureTextNotPresent(domainDescription); + cy.waitTextVisible("1 sub-domain"); + + // Move a domain from under a parent domain to the root level + cy.get('[data-testid="domain-list-item"]').contains("Marketing").prev().click(); + cy.clickOptionWithText(domainName); + cy.openThreeDotDropdown(); + cy.clickOptionWithTestId("entity-menu-move-button"); + cy.clickOptionWithTestId("move-domain-modal-move-button").wait(5000); + cy.goToDomainList(); + cy.waitTextVisible(domainName); + cy.waitTextVisible(domainDescription); + + // Delete a domain + cy.clickOptionWithText(domainName).wait(3000); + cy.openThreeDotDropdown(); + cy.clickOptionWithTestId("entity-menu-delete-button"); + cy.waitTextVisible("Are you sure you want to remove this Domain?"); + cy.clickOptionWithText("Yes"); + cy.waitTextVisible("Deleted Domain!"); + cy.ensureTextNotPresent(domainName); +
cy.ensureTextNotPresent(domainDescription); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js index e0d2bf240d74d6..aeceaf99be889c 100644 --- a/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js +++ b/smoke-test/tests/cypress/cypress/e2e/glossary/glossary_navigation.js @@ -4,42 +4,47 @@ const glossaryParentGroup = "Cypress"; describe("glossary sidebar navigation test", () => { it("create term and term parent group, move and delete term group", () => { - //create a new term group and term, move term to the group + + // Create a new term group and term, move term to the group cy.loginWithCredentials(); cy.goToGlossaryList(); - cy.clickOptionWithText("Add Term Group"); + cy.clickOptionWithTestId("add-term-group-button"); cy.waitTextVisible("Create Term Group"); - cy.get(".ant-input-affix-wrapper > input[type='text']").first().type(glossaryTermGroup); - cy.get(".ant-modal-footer > button:last-child").click(); - cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTermGroup).should("be.visible"); - cy.clickOptionWithText("Add Term"); + cy.enterTextInTestId("create-glossary-entity-modal-name", glossaryTermGroup); + cy.clickOptionWithTestId("glossary-entity-modal-create-button"); + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).should("be.visible"); + cy.clickOptionWithTestId("add-term-button"); + cy.waitTextVisible("Created Term Group!"); cy.waitTextVisible("Create Glossary Term"); - cy.get(".ant-input-affix-wrapper > input[type='text']").first().type(glossaryTerm); - cy.get(".ant-modal-footer > button:last-child").click(); - cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTerm).click(); - cy.waitTextVisible("No documentation yet"); + cy.enterTextInTestId("create-glossary-entity-modal-name", glossaryTerm); + cy.clickOptionWithTestId("glossary-entity-modal-create-button").wait(3000); + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTerm).click().wait(3000); cy.openThreeDotDropdown(); - cy.clickOptionWithText("Move"); - cy.get('[role="dialog"]').contains(glossaryTermGroup).click({force: true}); - cy.get('[role="dialog"]').contains(glossaryTermGroup).should("be.visible"); - cy.get("button").contains("Move").click(); + cy.clickOptionWithTestId("entity-menu-move-button"); + cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryTermGroup).click({force: true}); + cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryTermGroup).should("be.visible"); + cy.clickOptionWithTestId("glossary-entity-modal-move-button"); cy.waitTextVisible("Moved Glossary Term!"); - //ensure the new term is under the parent term group in the navigation sidebar - cy.get('*[class^="GlossaryBrowser"]').contains(glossaryTermGroup).click(); + + // Ensure the new term is under the parent term group in the navigation sidebar + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryTermGroup).click(); cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTerm).should("be.visible"); - //move a term group from the root level to be under a parent term group + + // Move a term group from the root level to be under a parent term group cy.goToGlossaryList(); cy.clickOptionWithText(glossaryTermGroup); cy.openThreeDotDropdown(); cy.clickOptionWithText("Move"); - cy.get('[role="dialog"]').contains(glossaryParentGroup).click({force: true}); -
cy.get('[role="dialog"]').contains(glossaryParentGroup).should("be.visible"); - cy.get("button").contains("Move").click(); + cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryParentGroup).click({force: true}); + cy.get('[data-testid="move-glossary-entity-modal"]').contains(glossaryParentGroup).should("be.visible"); + cy.clickOptionWithTestId("glossary-entity-modal-move-button"); cy.waitTextVisible("Moved Term Group!"); - //ensure it is no longer on the sidebar navigator at the top level but shows up under the new parent - cy.get('*[class^="GlossaryBrowser"]').contains(glossaryParentGroup).click(); + + // Ensure it is no longer on the sidebar navigator at the top level but shows up under the new parent + cy.get('[data-testid="glossary-browser-sidebar"]').contains(glossaryParentGroup).click(); cy.get('*[class^="GlossaryEntitiesList"]').contains(glossaryTermGroup).should("be.visible"); - //delete a term group + + // Delete a term group cy.goToGlossaryList(); cy.clickOptionWithText(glossaryParentGroup); cy.clickOptionWithText(glossaryTermGroup); @@ -50,7 +55,8 @@ describe("glossary sidebar navigation test", () => { cy.clickOptionWithText(glossaryTermGroup).wait(3000); cy.deleteFromDropdown(); cy.waitTextVisible("Deleted Term Group!"); - //ensure it is no longer in the sidebar navigator + + // Ensure it is no longer in the sidebar navigator cy.ensureTextNotPresent(glossaryTerm); cy.ensureTextNotPresent(glossaryTermGroup); }); diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/download_lineage_results.js b/smoke-test/tests/cypress/cypress/e2e/lineage/download_lineage_results.js new file mode 100644 index 00000000000000..ed4167b87c5060 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/download_lineage_results.js @@ -0,0 +1,83 @@ +const test_dataset = "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)"; +const first_degree = [ + "urn:li:chart:(looker,cypress_baz1)", + "urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleCypressHdfsDataset,PROD)", + "urn:li:mlFeature:(cypress-test-2,some-cypress-feature-1)" +]; +const second_degree = [ + "urn:li:chart:(looker,cypress_baz2)", + "urn:li:dashboard:(looker,cypress_baz)", + "urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)", + "urn:li:mlPrimaryKey:(cypress-test-2,some-cypress-feature-2)" +]; +const third_degree_plus = [ + "urn:li:dataJob:(urn:li:dataFlow:(airflow,cypress_dag_abc,PROD),cypress_task_123)", + "urn:li:dataJob:(urn:li:dataFlow:(airflow,cypress_dag_abc,PROD),cypress_task_456)", + "urn:li:dataset:(urn:li:dataPlatform:hive,cypress_logging_events,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_created,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_created_no_tag,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,fct_cypress_users_deleted,PROD)" +]; +const downloadCsvFile = (filename) => { + cy.get('[data-testid="three-dot-menu"]').click(); + cy.get('[data-testid="download-as-csv-menu-item"]').click(); + cy.get('[data-testid="download-as-csv-input"]').clear().type(filename); + cy.get('[data-testid="csv-modal-download-button"]').click().wait(5000); + cy.ensureTextNotPresent("Creating CSV to download"); +}; + +describe("download lineage results to .csv file", () => { + beforeEach(() => { + cy.on('uncaught:exception', (err, runnable) => { return false; }); + }); + + it("download and verify lineage results for 1st, 2nd and 3+ degree of dependencies", () => { + cy.loginWithCredentials(); + 
cy.goToDataset(test_dataset,"SampleCypressKafkaDataset"); + cy.openEntityTab("Lineage"); + + // Verify 1st degree of dependencies + cy.contains(/1 - [3-4] of [3-4]/); + downloadCsvFile("first_degree_results.csv"); + let first_degree_csv = cy.readFile('cypress/downloads/first_degree_results.csv'); + first_degree.forEach(function (urn) { + first_degree_csv.should('contain', urn); + }); + second_degree.forEach(function (urn) { + first_degree_csv.should('not.contain', urn); + }); + third_degree_plus.forEach(function (urn) { + first_degree_csv.should('not.contain', urn); + }); + + // Verify 1st and 2nd degree of dependencies + cy.get('[data-testid="facet-degree-2"]').click().wait(5000); + cy.contains(/1 - [7-8] of [7-8]/); + downloadCsvFile("second_degree_results.csv"); + let second_degree_csv = cy.readFile('cypress/downloads/second_degree_results.csv'); + first_degree.forEach(function (urn) { + second_degree_csv.should('contain', urn); + }); + second_degree.forEach(function (urn) { + second_degree_csv.should('contain', urn); + }); + third_degree_plus.forEach(function (urn) { + second_degree_csv.should('not.contain', urn); + }); + + // Verify 1st, 2nd and 3+ degrees of dependencies (verifies multi-page download) + cy.get('[data-testid="facet-degree-3+"]').click().wait(5000); + cy.contains(/1 - 10 of 1[3-4]/); + downloadCsvFile("third_plus_degree_results.csv"); + let third_degree_csv = cy.readFile('cypress/downloads/third_plus_degree_results.csv'); + first_degree.forEach(function (urn) { + third_degree_csv.should('contain', urn); + }); + second_degree.forEach(function (urn) { + third_degree_csv.should('contain', urn); + }); + third_degree_plus.forEach(function (urn) { + third_degree_csv.should('contain', urn); + }); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js new file mode 100644 index 00000000000000..37ca62c8d12291 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_column_path.js @@ -0,0 +1,68 @@ +import { aliasQuery } from "../utils"; +const DATASET_ENTITY_TYPE = 'dataset'; +const DATASET_URN = 'urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleCypressHdfsDataset,PROD)'; +const DOWNSTREAM_DATASET_URN = "urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)"; +const upstreamColumn = '[data-testid="node-urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)-Upstream"] text'; +const downstreamColumn = '[data-testid="node-urn:li:dataset:(urn:li:dataPlatform:hdfs,SampleCypressHdfsDataset,PROD)-Downstream"] text'; + +const verifyColumnPathModal = (from, to) => { + cy.get('[data-testid="entity-paths-modal"]').contains(from).should("be.visible"); + cy.get('[data-testid="entity-paths-modal"]').contains(to).should("be.visible"); +}; + +describe("column-level lineage and impact analysis path test", () => { + beforeEach(() => { + cy.on('uncaught:exception', (err, runnable) => { return false; }); + cy.intercept("POST", "/api/v2/graphql", (req) => { + aliasQuery(req, "appConfig"); + }); + }); + + it("verify column-level lineage path at lineage graph and impact analysis", () => { + // Open dataset with column-level lineage configured and navigate to lineage tab -> visualize lineage + cy.loginWithCredentials(); + cy.goToEntityLineageGraph(DATASET_ENTITY_TYPE, DATASET_URN); + + // Enable “show columns” toggle + cy.waitTextVisible("SampleCypressHdfs"); + cy.clickOptionWithTestId("column-toggle"); +
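// Column-level nodes should render once the toggle is on +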
cy.waitTextVisible("shipment_info"); + + // Verify functionality of column lineage + cy.get(upstreamColumn).eq(3).click(); + cy.get(upstreamColumn).eq(3).prev().should('not.have.attr', 'fill', 'white'); + cy.get(downstreamColumn).eq(2).prev().should('not.have.attr', 'stroke', 'transparent'); + cy.get(downstreamColumn).eq(2).click(); + cy.get(downstreamColumn).eq(2).prev().should('not.have.attr', 'fill', 'white'); + cy.get(upstreamColumn).eq(3).prev().should('not.have.attr', 'stroke', 'transparent'); + + // Open dataset impact analysis view, enable column lineage + cy.goToDataset(DATASET_URN, "SampleCypressHdfsDataset"); + cy.openEntityTab("Lineage"); + cy.clickOptionWithText("Column Lineage"); + cy.clickOptionWithText("Downstream"); + + // Verify upstream column lineage, test column path modal + cy.clickOptionWithText("Upstream"); + cy.waitTextVisible("SampleCypressKafkaDataset"); + cy.ensureTextNotPresent("field_bar"); + cy.contains("Select column").click({ force: true}).wait(1000); + cy.get(".rc-virtual-list").contains("shipment_info").click(); + cy.waitTextVisible("field_bar"); + cy.clickOptionWithText("field_bar"); + verifyColumnPathModal("shipment_info", "field_bar"); + cy.get('[data-testid="entity-paths-modal"] [data-icon="close"]').click(); + + // Verify downstream column lineage, test column path modal + cy.goToDataset(DOWNSTREAM_DATASET_URN, "SampleCypressKafkaDataset"); + cy.openEntityTab("Lineage"); + cy.clickOptionWithText("Column Lineage"); + cy.ensureTextNotPresent("shipment_info"); + cy.contains("Select column").click({ force: true}).wait(1000); + cy.get(".rc-virtual-list").contains("field_bar").click(); + cy.waitTextVisible("shipment_info"); + cy.clickOptionWithText("shipment_info"); + verifyColumnPathModal("shipment_info", "field_bar"); + cy.get('[data-testid="entity-paths-modal"] [data-icon="close"]').click(); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_graph.js b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_graph.js index 9e035f7f897722..85db210649c27b 100644 --- a/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_graph.js +++ b/smoke-test/tests/cypress/cypress/e2e/lineage/lineage_graph.js @@ -5,8 +5,6 @@ const TASKS_ENTITY_TYPE = 'tasks'; const DATASET_URN = 'urn:li:dataset:(urn:li:dataPlatform:kafka,SampleCypressKafkaDataset,PROD)'; const JAN_1_2021_TIMESTAMP = 1609553357755; const JAN_1_2022_TIMESTAMP = 1641089357755; -const TIMESTAMP_MILLIS_EIGHT_DAYS_AGO = getTimestampMillisNumDaysAgo(8); -const TIMESTAMP_MILLIS_ONE_DAY_AGO = getTimestampMillisNumDaysAgo(1); const TIMESTAMP_MILLIS_14_DAYS_AGO = getTimestampMillisNumDaysAgo(14); const TIMESTAMP_MILLIS_7_DAYS_AGO = getTimestampMillisNumDaysAgo(7); const TIMESTAMP_MILLIS_NOW = getTimestampMillisNumDaysAgo(0); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_ownership.js b/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_ownership.js index fcc0566f3f6cee..465d7998b9f9a4 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_ownership.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/dataset_ownership.js @@ -5,7 +5,7 @@ const password = "Example password"; const group_name = `Test group ${test_id}`; const addOwner = (owner, type, elementId) => { - cy.clickOptionWithText("Add Owners"); + cy.clickOptionWithTestId("add-owners-button"); cy.contains("Search for users or groups...").click({ force: true }); cy.focused().type(owner); cy.clickOptionWithText(owner); @@ -29,6 +29,10 @@ const addOwner = (owner, type, 
elementId) => { } describe("add, remove ownership for dataset", () => { + beforeEach(() => { + cy.on('uncaught:exception', (err, runnable) => { return false; }); + }); + it("create test user and test group, add user to a group", () => { cy.loginWithCredentials(); cy.createUser(username, password, email); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js index 83b66e2cb2549d..5f9758a35ca0ea 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/edit_documentation.js @@ -10,20 +10,20 @@ describe("edit documentation and link to dataset", () => { cy.visit( "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" ); - cy.get("[role='tab']").contains("Documentation").click(); + cy.openEntityTab("Documentation"); cy.waitTextVisible("my hive dataset"); cy.waitTextVisible("Sample doc"); - cy.clickOptionWithText("Edit"); + cy.clickOptionWithTestId("edit-documentation-button"); cy.focused().clear(); cy.focused().type(documentation_edited); - cy.get("button").contains("Save").click(); + cy.clickOptionWithTestId("description-editor-save-button"); cy.waitTextVisible("Description Updated"); cy.waitTextVisible(documentation_edited); //return documentation to original state - cy.clickOptionWithText("Edit"); + cy.clickOptionWithTestId("edit-documentation-button"); cy.focused().clear().wait(1000); cy.focused().type("my hive dataset"); - cy.get("button").contains("Save").click(); + cy.clickOptionWithTestId("description-editor-save-button"); cy.waitTextVisible("Description Updated"); cy.waitTextVisible("my hive dataset"); }); @@ -33,21 +33,21 @@ describe("edit documentation and link to dataset", () => { cy.visit( "/dataset/urn:li:dataset:(urn:li:dataPlatform:hive,SampleCypressHiveDataset,PROD)/Schema" ); - cy.get("[role='tab']").contains("Documentation").click(); + cy.openEntityTab("Documentation"); cy.contains("Sample doc").trigger("mouseover", { force: true }); cy.get('[data-icon="delete"]').click(); cy.waitTextVisible("Link Removed"); - cy.get("button").contains("Add Link").click().wait(1000); - cy.get('[role="dialog"] #addLinkForm_url').type(wrong_url); + cy.clickOptionWithTestId("add-link-button").wait(1000); + cy.enterTextInTestId("add-link-modal-url", wrong_url); cy.waitTextVisible("This field must be a valid url."); cy.focused().clear(); cy.waitTextVisible("A URL is required."); - cy.focused().type(correct_url); + cy.enterTextInTestId("add-link-modal-url", correct_url); cy.ensureTextNotPresent("This field must be a valid url."); - cy.get("#addLinkForm_label").type("Sample doc"); - cy.get('[role="dialog"] button').contains("Add").click(); + cy.enterTextInTestId("add-link-modal-label", "Sample doc"); + cy.clickOptionWithTestId("add-link-modal-add-button"); cy.waitTextVisible("Link Added"); - cy.get("[role='tab']").contains("Documentation").click(); + cy.openEntityTab("Documentation"); cy.get(`[href='${correct_url}']`).should("be.visible"); }); @@ -55,18 +55,18 @@ describe("edit documentation and link to dataset", () => { cy.loginWithCredentials(); cy.visit("/domain/urn:li:domain:marketing/Entities"); cy.waitTextVisible("SampleCypressKafkaDataset"); - cy.get("button").contains("Add Link").click().wait(1000); - cy.get('[role="dialog"] #addLinkForm_url').type(wrong_url); + cy.clickOptionWithTestId("add-link-button").wait(1000); + cy.enterTextInTestId("add-link-modal-url", wrong_url); 
cy.waitTextVisible("This field must be a valid url."); cy.focused().clear(); cy.waitTextVisible("A URL is required."); - cy.focused().type(correct_url); + cy.enterTextInTestId("add-link-modal-url", correct_url); cy.ensureTextNotPresent("This field must be a valid url."); - cy.get("#addLinkForm_label").type("Sample doc"); - cy.get('[role="dialog"] button').contains("Add").click(); + cy.enterTextInTestId("add-link-modal-label", "Sample doc"); + cy.clickOptionWithTestId("add-link-modal-add-button"); cy.waitTextVisible("Link Added"); - cy.get("[role='tab']").contains("Documentation").click(); - cy.waitTextVisible("Edit"); + cy.openEntityTab("Documentation"); + cy.get("[data-testid='edit-documentation-button']").should("be.visible"); cy.get(`[href='${correct_url}']`).should("be.visible"); cy.contains("Sample doc").trigger("mouseover", { force: true }); cy.get('[data-icon="delete"]').click(); @@ -83,14 +83,14 @@ describe("edit documentation and link to dataset", () => { cy.waitTextVisible("Foo field description has changed"); cy.focused().clear().wait(1000); cy.focused().type(documentation_edited); - cy.get("button").contains("Update").click(); + cy.clickOptionWithTestId("description-modal-update-button"); cy.waitTextVisible("Updated!"); cy.waitTextVisible(documentation_edited); cy.waitTextVisible("(edited)"); cy.get("tbody [data-icon='edit']").first().click({ force: true }); cy.focused().clear().wait(1000); cy.focused().type("Foo field description has changed"); - cy.get("button").contains("Update").click(); + cy.clickOptionWithTestId("description-modal-update-button"); cy.waitTextVisible("Updated!"); cy.waitTextVisible("Foo field description has changed"); cy.waitTextVisible("(edited)"); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js b/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js new file mode 100644 index 00000000000000..6c5dd778106448 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/ingestion_source.js @@ -0,0 +1,68 @@ + +const number = Math.floor(Math.random() * 100000); +const accound_id = `account${number}`; +const warehouse_id = `warehouse${number}`; +const username = `user${number}`; +const password = `password${number}`; +const role = `role${number}`; +const ingestion_source_name = `ingestion source ${number}`; + +describe("ingestion source creation flow", () => { + it("create a ingestion source using ui, verify ingestion source details saved correctly, remove ingestion source", () => { + // Go to ingestion page, create a snowflake source + cy.loginWithCredentials(); + cy.goToIngestionPage(); + cy.clickOptionWithTestId("create-ingestion-source-button"); + cy.clickOptionWithText("Snowflake"); + cy.waitTextVisible("Snowflake Recipe"); + cy.get("#account_id").type(accound_id); + cy.get("#warehouse").type(warehouse_id); + cy.get("#username").type(username); + cy.get("#password").type(password); + cy.focused().blur(); + cy.get("#role").type(role); + + // Verify yaml recipe is generated correctly + cy.clickOptionWithTestId("recipe-builder-yaml-button"); + cy.waitTextVisible("account_id"); + cy.waitTextVisible(accound_id); + cy.waitTextVisible(warehouse_id); + cy.waitTextVisible(username); + cy.waitTextVisible(password); + cy.waitTextVisible(role); + + // Finish creating source + cy.clickOptionWithTestId("recipe-builder-next-button"); + cy.waitTextVisible("Configure an Ingestion Schedule"); + cy.clickOptionWithTestId("ingestion-schedule-next-button"); + cy.waitTextVisible("Give this ingestion source a name."); + 
cy.get('[data-testid="source-name-input"]').type(ingestion_source_name); + cy.clickOptionWithTestId("ingestion-source-save-button"); + cy.waitTextVisible("Successfully created ingestion source!").wait(5000) + cy.waitTextVisible(ingestion_source_name); + cy.get('[data-testid="ingestion-source-table-status"]').contains("Pending...").should("be.visible"); + + // Verify ingestion source details are saved correctly + cy.get('[data-testid="ingestion-source-table-edit-button"]').first().click(); + cy.waitTextVisible("Edit Ingestion Source"); + cy.get("#account_id").should("have.value", accound_id); + cy.get("#warehouse").should("have.value", warehouse_id); + cy.get("#username").should("have.value", username); + cy.get("#password").should("have.value", password); + cy.get("#role").should("have.value", role); + cy.get("button").contains("Next").click(); + cy.waitTextVisible("Configure an Ingestion Schedule"); + cy.clickOptionWithTestId("ingestion-schedule-next-button"); + cy.get('[data-testid="source-name-input"]').clear().type(ingestion_source_name + " EDITED"); + cy.clickOptionWithTestId("ingestion-source-save-button"); + cy.waitTextVisible("Successfully updated ingestion source!"); + cy.waitTextVisible(ingestion_source_name + " EDITED"); + + // Remove ingestion source + cy.get('[data-testid="delete-button"]').first().click(); + cy.waitTextVisible("Confirm Ingestion Source Removal"); + cy.get("button").contains("Yes").click(); + cy.waitTextVisible("Removed ingestion source."); + cy.ensureTextNotPresent(ingestion_source_name + " EDITED") + }) +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js index 24a24cc21138df..3d052695e818f9 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/managed_ingestion.js @@ -7,7 +7,7 @@ describe("run managed ingestion", () => { it("create run managed ingestion source", () => { let number = Math.floor(Math.random() * 100000); let testName = `cypress test source ${number}` - let cli_version = "0.10.5.4"; + let cli_version = "0.12.0"; cy.login(); cy.goToIngestionPage(); cy.clickOptionWithText("Create new source"); diff --git a/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js b/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js index 466bb2ef0757e7..77fd63b9cae02f 100644 --- a/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js +++ b/smoke-test/tests/cypress/cypress/e2e/mutations/managing_secrets.js @@ -8,23 +8,24 @@ const ingestion_source_name = `ingestion source ${number}`; describe("managing secrets for ingestion creation", () => { it("create a secret, create ingestion source using a secret, remove a secret", () => { + // Navigate to the manage ingestion page → secrets cy.loginWithCredentials(); - //navigate to the manage ingestion page → secrets cy.goToIngestionPage(); - cy.clickOptionWithText("Secrets"); - //create a new secret - cy.clickOptionWithText("Create new secret"); - cy.get('[role="dialog"]').contains("Create a new Secret").should("be.visible"); - cy.get('[role="dialog"] #name').type(`secretname${number}`); - cy.get('[role="dialog"] #value').type(`secretvalue${number}`); - cy.get('[role="dialog"] #description').type(`secretdescription${number}`); - cy.get('#createSecretButton').click(); + cy.openEntityTab("Secrets"); + + // Create a new secret + cy.clickOptionWithTestId("create-secret-button"); + 
cy.enterTextInTestId('secret-modal-name-input', `secretname${number}`); + cy.enterTextInTestId('secret-modal-value-input', `secretvalue${number}`); + cy.enterTextInTestId('secret-modal-description-input', `secretdescription${number}`); + cy.clickOptionWithTestId("secret-modal-create-button"); cy.waitTextVisible("Successfully created Secret!"); cy.waitTextVisible(`secretname${number}`); - cy.waitTextVisible(`secretdescription${number}`).wait(5000)//prevent issue with missing secret - //create an ingestion source using a secret + cy.waitTextVisible(`secretdescription${number}`).wait(5000); + + // Create an ingestion source using a secret cy.goToIngestionPage(); - cy.clickOptionWithText("Create new source"); + cy.get("#ingestion-create-source").click(); cy.clickOptionWithText("Snowflake"); cy.waitTextVisible("Snowflake Recipe"); cy.get("#account_id").type(accound_id); @@ -40,11 +41,12 @@ describe("managing secrets for ingestion creation", () => { cy.waitTextVisible("Give this ingestion source a name."); cy.get('[data-testid="source-name-input"]').type(ingestion_source_name); cy.get("button").contains("Save").click(); - cy.waitTextVisible("Successfully created ingestion source!").wait(5000)//prevent issue with missing form data + cy.waitTextVisible("Successfully created ingestion source!").wait(5000); cy.waitTextVisible(ingestion_source_name); cy.get("button").contains("Pending...").should("be.visible"); - //remove a secret - cy.clickOptionWithText("Secrets"); + + // Remove a secret + cy.openEntityTab("Secrets"); cy.waitTextVisible(`secretname${number}`); cy.get('[data-icon="delete"]').first().click(); cy.waitTextVisible("Confirm Secret Removal"); @@ -52,14 +54,16 @@ cy.waitTextVisible("Removed secret."); cy.ensureTextNotPresent(`secretname${number}`); cy.ensureTextNotPresent(`secretdescription${number}`); - //remove ingestion source + + // Remove ingestion source cy.goToIngestionPage(); cy.get('[data-testid="delete-button"]').first().click(); cy.waitTextVisible("Confirm Ingestion Source Removal"); cy.get("button").contains("Yes").click(); cy.waitTextVisible("Removed ingestion source."); cy.ensureTextNotPresent(ingestion_source_name) - //verify secret is not present during ingestion source creation for password dropdown + + // Verify secret is not present during ingestion source creation for password dropdown cy.clickOptionWithText("Create new source"); cy.clickOptionWithText("Snowflake"); cy.waitTextVisible("Snowflake Recipe"); @@ -68,13 +72,13 @@ cy.get("#username").type(username); cy.get("#password").click().wait(1000); cy.ensureTextNotPresent(`secretname${number}`); - //verify secret can be added during ingestion source creation and used successfully + + // Verify secret can be added during ingestion source creation and used successfully cy.clickOptionWithText("Create Secret"); - cy.get('[role="dialog"]').contains("Create a new Secret").should("be.visible"); - cy.get('[role="dialog"] #name').type(`secretname${number}`); - cy.get('[role="dialog"] #value').type(`secretvalue${number}`); - cy.get('[role="dialog"] #description').type(`secretdescription${number}`); - cy.get('#createSecretButton').click(); + cy.enterTextInTestId('secret-modal-name-input', `secretname${number}`); + cy.enterTextInTestId('secret-modal-value-input', `secretvalue${number}`); + cy.enterTextInTestId('secret-modal-description-input', `secretdescription${number}`); +
cy.clickOptionWithTestId("secret-modal-create-button"); cy.waitTextVisible("Created secret!"); cy.get("#role").type(role); cy.get("button").contains("Next").click(); @@ -86,6 +90,7 @@ describe("managing secrets for ingestion creation", () => { cy.waitTextVisible("Successfully created ingestion source!").wait(5000)//prevent issue with missing form data cy.waitTextVisible(ingestion_source_name); cy.get("button").contains("Pending...").should("be.visible"); + //Remove ingestion source and secret cy.goToIngestionPage(); cy.get('[data-testid="delete-button"]').first().click(); diff --git a/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js b/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js new file mode 100644 index 00000000000000..4637310b864968 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/search/query_and_filter_search.js @@ -0,0 +1,57 @@ +describe("auto-complete dropdown, filter plus query search test", () => { + + const platformQuerySearch = (query,test_id,active_filter) => { + cy.visit("/"); + cy.get("input[data-testid=search-input]").type(query); + cy.get(`[data-testid="quick-filter-urn:li:dataPlatform:${test_id}"]`).click(); + cy.focused().type("{enter}").wait(3000); + cy.url().should( + "include", + `?filter_platform___false___EQUAL___0=urn%3Ali%3AdataPlatform%3A${test_id}` + ); + cy.get('[data-testid="search-input"]').should("have.value", query); + cy.get(`[data-testid="active-filter-${active_filter}"]`).should("be.visible"); + cy.contains("of 0 results").should("not.exist"); + cy.contains(/of [0-9]+ results/); + } + + const entityQuerySearch = (query,test_id,active_filter) => { + cy.visit("/"); + cy.get("input[data-testid=search-input]").type(query); + cy.get(`[data-testid="quick-filter-${test_id}"]`).click(); + cy.focused().type("{enter}").wait(3000); + cy.url().should( + "include", + `?filter__entityType___false___EQUAL___0=${test_id}` + ); + cy.get('[data-testid="search-input"]').should("have.value", query); + cy.get(`[data-testid="active-filter-${active_filter}"]`).should("be.visible"); + cy.contains("of 0 results").should("not.exist"); + cy.contains(/of [0-9]+ results/); + } + + it("verify the 'filter by' section + query (result in search page with query applied + filter applied)", () => { + // Platform query plus filter test + cy.loginWithCredentials(); + // Airflow + platformQuerySearch ("cypress","airflow","Airflow"); + // BigQuery + platformQuerySearch ("cypress","bigquery","BigQuery"); + // dbt + platformQuerySearch ("cypress","dbt","dbt"); + // Hive + platformQuerySearch ("cypress","hive","Hive"); + + // Entity type query plus filter test + // Datasets + entityQuerySearch ("cypress","DATASET","Datasets"); + // Dashboards + entityQuerySearch ("cypress","DASHBOARD","Dashboards"); + // Pipelines + entityQuerySearch ("cypress","DATA_FLOW","Pipelines"); + // Domains + entityQuerySearch ("Marketing","DOMAIN","Domains"); + // Glossary Terms + entityQuerySearch ("cypress","GLOSSARY_TERM","Glossary Terms"); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/manage_access_tokens.js b/smoke-test/tests/cypress/cypress/e2e/settings/manage_access_tokens.js new file mode 100644 index 00000000000000..7a77c2b77df5b0 --- /dev/null +++ b/smoke-test/tests/cypress/cypress/e2e/settings/manage_access_tokens.js @@ -0,0 +1,43 @@ +import { aliasQuery, hasOperationName } from "../utils"; +const test_id = Math.floor(Math.random() * 100000); + +describe("manage access tokens", () => { + before(() => { + 
cy.intercept("POST", "/api/v2/graphql", (req) => { + aliasQuery(req, "appConfig"); + }); + }); + + const setTokenAuthEnabledFlag = (isOn) => { + cy.intercept("POST", "/api/v2/graphql", (req) => { + if (hasOperationName(req, "appConfig")) { + req.reply((res) => { + res.body.data.appConfig.authConfig.tokenAuthEnabled = isOn; + }); + } + }); + }; + + it("create and revoke access token", () => { + //create access token, verify token on ui + setTokenAuthEnabledFlag(true); + cy.loginWithCredentials(); + cy.goToAccessTokenSettings(); + cy.clickOptionWithTestId("add-token-button"); + cy.enterTextInTestId("create-access-token-name", "Token Name" + test_id); + cy.enterTextInTestId("create-access-token-description", "Token Description" + test_id); + cy.clickOptionWithTestId("create-access-token-button"); + cy.waitTextVisible("New Personal Access Token"); + cy.get('[data-testid="access-token-value"]').should("be.visible"); + cy.get('[data-testid="access-token-value"]').invoke('text').should('match', /^[a-zA-Z0-9-_]+\.[a-zA-Z0-9-_]+\.[a-zA-Z0-9-_]+$/); + cy.clickOptionWithTestId("access-token-modal-close-button"); + //revoke access token, verify token removed from ui + cy.waitTextVisible("Token Name" + test_id); + cy.waitTextVisible("Token Description" + test_id); + cy.clickOptionWithTestId("revoke-token-button"); + cy.waitTextVisible("Are you sure you want to revoke this token?"); + cy.clickOptionWithText("Yes"); + cy.ensureTextNotPresent("Token Name" + test_id); + cy.ensureTextNotPresent("Token Description" + test_id); + }); +}); \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/support/commands.js b/smoke-test/tests/cypress/cypress/support/commands.js index 8bfe7305c001f5..5e3664f944edf1 100644 --- a/smoke-test/tests/cypress/cypress/support/commands.js +++ b/smoke-test/tests/cypress/cypress/support/commands.js @@ -66,6 +66,7 @@ Cypress.Commands.add("logout", () => { Cypress.Commands.add("goToGlossaryList", () => { cy.visit("/glossary"); cy.waitTextVisible("Glossary"); + cy.wait(3000); }); Cypress.Commands.add("goToDomainList", () => { @@ -84,6 +85,12 @@ Cypress.Commands.add("goToOwnershipTypesSettings", () => { cy.waitTextVisible("Manage Ownership"); }); +Cypress.Commands.add("goToAccessTokenSettings", () => { + cy.visit("/settings/tokens"); + cy.waitTextVisible("Manage Access Tokens"); + cy.wait(3000); +}); + Cypress.Commands.add("goToIngestionPage", () => { cy.visit("/ingestion"); cy.waitTextVisible("Manage Ingestion"); diff --git a/smoke-test/tests/cypress/integration_test.py b/smoke-test/tests/cypress/integration_test.py index b3bacf39ac7aec..4ad2bc53fa87d9 100644 --- a/smoke-test/tests/cypress/integration_test.py +++ b/smoke-test/tests/cypress/integration_test.py @@ -1,18 +1,16 @@ -from typing import Set, List - import datetime -import pytest -import subprocess import os +import subprocess +from typing import List, Set + +import pytest + +from tests.setup.lineage.ingest_time_lineage import (get_time_lineage_urns, + ingest_time_lineage) +from tests.utils import (create_datahub_step_state_aspects, delete_urns, + delete_urns_from_file, get_admin_username, + ingest_file_via_rest) -from tests.utils import ( - create_datahub_step_state_aspects, - get_admin_username, - ingest_file_via_rest, - delete_urns_from_file, - delete_urns, -) -from tests.setup.lineage.ingest_time_lineage import ingest_time_lineage, get_time_lineage_urns CYPRESS_TEST_DATA_DIR = "tests/cypress" TEST_DATA_FILENAME = "data.json" @@ -145,7 +143,6 @@ def ingest_cleanup_data(): 
delete_urns_from_file(f"{CYPRESS_TEST_DATA_DIR}/{TEST_ONBOARDING_DATA_FILENAME}") delete_urns(get_time_lineage_urns()) - print_now() print("deleting onboarding data file") if os.path.exists(f"{CYPRESS_TEST_DATA_DIR}/{TEST_ONBOARDING_DATA_FILENAME}"): diff --git a/smoke-test/tests/dataproduct/test_dataproduct.py b/smoke-test/tests/dataproduct/test_dataproduct.py index db198098f21fab..baef1cb1cb3ba0 100644 --- a/smoke-test/tests/dataproduct/test_dataproduct.py +++ b/smoke-test/tests/dataproduct/test_dataproduct.py @@ -1,4 +1,6 @@ +import logging import os +import subprocess import tempfile import time from random import randint @@ -17,8 +19,6 @@ DomainPropertiesClass, DomainsClass) from datahub.utilities.urns.urn import Urn -import subprocess -import logging logger = logging.getLogger(__name__) diff --git a/smoke-test/tests/delete/delete_test.py b/smoke-test/tests/delete/delete_test.py index 68e001f983fbf5..d920faaf3a89a4 100644 --- a/smoke-test/tests/delete/delete_test.py +++ b/smoke-test/tests/delete/delete_test.py @@ -1,16 +1,14 @@ -import os import json -import pytest +import os from time import sleep + +import pytest from datahub.cli.cli_utils import get_aspects_for_entity from datahub.cli.ingest_cli import get_session_and_host -from tests.utils import ( - ingest_file_via_rest, - wait_for_healthcheck_util, - delete_urns_from_file, - wait_for_writes_to_sync, - get_datahub_graph, -) + +from tests.utils import (delete_urns_from_file, get_datahub_graph, + ingest_file_via_rest, wait_for_healthcheck_util, + wait_for_writes_to_sync) # Disable telemetry os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" @@ -102,7 +100,7 @@ def test_delete_reference(test_setup, depends=["test_healthchecks"]): graph.delete_references_to_urn(tag_urn, dry_run=False) wait_for_writes_to_sync() - + # Validate that references no longer exist references_count, related_aspects = graph.delete_references_to_urn( tag_urn, dry_run=True diff --git a/smoke-test/tests/deprecation/deprecation_test.py b/smoke-test/tests/deprecation/deprecation_test.py index 1149a970aa8e5e..a8969804d03d7b 100644 --- a/smoke-test/tests/deprecation/deprecation_test.py +++ b/smoke-test/tests/deprecation/deprecation_test.py @@ -1,10 +1,7 @@ import pytest -from tests.utils import ( - delete_urns_from_file, - get_frontend_url, - ingest_file_via_rest, - get_root_urn, -) + +from tests.utils import (delete_urns_from_file, get_frontend_url, get_root_urn, + ingest_file_via_rest) @pytest.fixture(scope="module", autouse=True) diff --git a/smoke-test/tests/domains/domains_test.py b/smoke-test/tests/domains/domains_test.py index 7ffe1682cafd89..fa8c918e3cbe16 100644 --- a/smoke-test/tests/domains/domains_test.py +++ b/smoke-test/tests/domains/domains_test.py @@ -1,12 +1,8 @@ import pytest import tenacity -from tests.utils import ( - delete_urns_from_file, - get_frontend_url, - get_gms_url, - ingest_file_via_rest, - get_sleep_info, -) + +from tests.utils import (delete_urns_from_file, get_frontend_url, get_gms_url, + get_sleep_info, ingest_file_via_rest) sleep_sec, sleep_times = get_sleep_info() @@ -240,4 +236,7 @@ def test_set_unset_domain(frontend_session, ingest_cleanup_data): assert res_data assert res_data["data"]["dataset"]["domain"]["domain"]["urn"] == domain_urn - assert res_data["data"]["dataset"]["domain"]["domain"]["properties"]["name"] == "Engineering" + assert ( + res_data["data"]["dataset"]["domain"]["domain"]["properties"]["name"] + == "Engineering" + ) diff --git a/smoke-test/tests/managed-ingestion/managed_ingestion_test.py 
b/smoke-test/tests/managed-ingestion/managed_ingestion_test.py index 1238a1dd5730aa..b5e408731334e1 100644 --- a/smoke-test/tests/managed-ingestion/managed_ingestion_test.py +++ b/smoke-test/tests/managed-ingestion/managed_ingestion_test.py @@ -3,7 +3,8 @@ import pytest import tenacity -from tests.utils import get_frontend_url, get_sleep_info, wait_for_healthcheck_util +from tests.utils import (get_frontend_url, get_sleep_info, + wait_for_healthcheck_util) sleep_sec, sleep_times = get_sleep_info() diff --git a/smoke-test/tests/patch/common_patch_tests.py b/smoke-test/tests/patch/common_patch_tests.py index 574e4fd4e4c886..f1d6abf5da794c 100644 --- a/smoke-test/tests/patch/common_patch_tests.py +++ b/smoke-test/tests/patch/common_patch_tests.py @@ -2,25 +2,17 @@ import uuid from typing import Dict, Optional, Type -from datahub.emitter.mce_builder import ( - make_tag_urn, - make_term_urn, - make_user_urn, -) +from datahub.emitter.mce_builder import (make_tag_urn, make_term_urn, + make_user_urn) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_patch_builder import MetadataPatchProposal from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig -from datahub.metadata.schema_classes import ( - AuditStampClass, - GlobalTagsClass, - GlossaryTermAssociationClass, - GlossaryTermsClass, - OwnerClass, - OwnershipClass, - OwnershipTypeClass, - TagAssociationClass, - _Aspect, -) +from datahub.metadata.schema_classes import (AuditStampClass, GlobalTagsClass, + GlossaryTermAssociationClass, + GlossaryTermsClass, OwnerClass, + OwnershipClass, + OwnershipTypeClass, + TagAssociationClass, _Aspect) def helper_test_entity_terms_patch( @@ -34,18 +26,14 @@ def get_terms(graph, entity_urn): term_urn = make_term_urn(term=f"testTerm-{uuid.uuid4()}") - term_association = GlossaryTermAssociationClass( - urn=term_urn, context="test" - ) + term_association = GlossaryTermAssociationClass(urn=term_urn, context="test") global_terms = GlossaryTermsClass( terms=[term_association], auditStamp=AuditStampClass( time=int(time.time() * 1000.0), actor=make_user_urn("tester") ), ) - mcpw = MetadataChangeProposalWrapper( - entityUrn=test_entity_urn, aspect=global_terms - ) + mcpw = MetadataChangeProposalWrapper(entityUrn=test_entity_urn, aspect=global_terms) with DataHubGraph(DataHubGraphConfig()) as graph: graph.emit_mcp(mcpw) @@ -88,9 +76,7 @@ def helper_test_dataset_tags_patch( tag_association = TagAssociationClass(tag=tag_urn, context="test") global_tags = GlobalTagsClass(tags=[tag_association]) - mcpw = MetadataChangeProposalWrapper( - entityUrn=test_entity_urn, aspect=global_tags - ) + mcpw = MetadataChangeProposalWrapper(entityUrn=test_entity_urn, aspect=global_tags) with DataHubGraph(DataHubGraphConfig()) as graph: graph.emit_mcp(mcpw) @@ -153,15 +139,11 @@ def helper_test_ownership_patch( assert owner.owners[0].owner == make_user_urn("jdoe") for patch_mcp in ( - patch_builder_class(test_entity_urn) - .add_owner(owner_to_add) - .build() + patch_builder_class(test_entity_urn).add_owner(owner_to_add).build() ): graph.emit_mcp(patch_mcp) - owner = graph.get_aspect( - entity_urn=test_entity_urn, aspect_type=OwnershipClass - ) + owner = graph.get_aspect(entity_urn=test_entity_urn, aspect_type=OwnershipClass) assert len(owner.owners) == 2 for patch_mcp in ( @@ -171,9 +153,7 @@ def helper_test_ownership_patch( ): graph.emit_mcp(patch_mcp) - owner = graph.get_aspect( - entity_urn=test_entity_urn, aspect_type=OwnershipClass - ) + owner = 
graph.get_aspect(entity_urn=test_entity_urn, aspect_type=OwnershipClass) assert len(owner.owners) == 1 assert owner.owners[0].owner == make_user_urn("jdoe") @@ -199,9 +179,7 @@ def get_custom_properties( orig_aspect = base_aspect assert hasattr(orig_aspect, "customProperties") orig_aspect.customProperties = base_property_map - mcpw = MetadataChangeProposalWrapper( - entityUrn=test_entity_urn, aspect=orig_aspect - ) + mcpw = MetadataChangeProposalWrapper(entityUrn=test_entity_urn, aspect=orig_aspect) with DataHubGraph(DataHubGraphConfig()) as graph: graph.emit(mcpw) diff --git a/smoke-test/tests/patch/test_datajob_patches.py b/smoke-test/tests/patch/test_datajob_patches.py index 407410ee89914d..342d5d683228a7 100644 --- a/smoke-test/tests/patch/test_datajob_patches.py +++ b/smoke-test/tests/patch/test_datajob_patches.py @@ -3,19 +3,14 @@ from datahub.emitter.mce_builder import make_data_job_urn, make_dataset_urn from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.graph.client import DataHubGraph, DataHubGraphConfig -from datahub.metadata.schema_classes import ( - DataJobInfoClass, - DataJobInputOutputClass, - EdgeClass, -) +from datahub.metadata.schema_classes import (DataJobInfoClass, + DataJobInputOutputClass, + EdgeClass) from datahub.specific.datajob import DataJobPatchBuilder from tests.patch.common_patch_tests import ( - helper_test_custom_properties_patch, - helper_test_dataset_tags_patch, - helper_test_entity_terms_patch, - helper_test_ownership_patch, -) + helper_test_custom_properties_patch, helper_test_dataset_tags_patch, + helper_test_entity_terms_patch, helper_test_ownership_patch) def _make_test_datajob_urn( @@ -37,16 +32,12 @@ def test_datajob_ownership_patch(wait_for_healthchecks): # Tags def test_datajob_tags_patch(wait_for_healthchecks): - helper_test_dataset_tags_patch( - _make_test_datajob_urn(), DataJobPatchBuilder - ) + helper_test_dataset_tags_patch(_make_test_datajob_urn(), DataJobPatchBuilder) # Terms def test_dataset_terms_patch(wait_for_healthchecks): - helper_test_entity_terms_patch( - _make_test_datajob_urn(), DataJobPatchBuilder - ) + helper_test_entity_terms_patch(_make_test_datajob_urn(), DataJobPatchBuilder) # Custom Properties diff --git a/smoke-test/tests/patch/test_dataset_patches.py b/smoke-test/tests/patch/test_dataset_patches.py index 239aab64675d8d..6704d19760fb9a 100644 --- a/smoke-test/tests/patch/test_dataset_patches.py +++ b/smoke-test/tests/patch/test_dataset_patches.py @@ -20,7 +20,10 @@ UpstreamClass, UpstreamLineageClass) from datahub.specific.dataset import DatasetPatchBuilder -from tests.patch.common_patch_tests import helper_test_entity_terms_patch, helper_test_dataset_tags_patch, helper_test_ownership_patch, helper_test_custom_properties_patch + +from tests.patch.common_patch_tests import ( + helper_test_custom_properties_patch, helper_test_dataset_tags_patch, + helper_test_entity_terms_patch, helper_test_ownership_patch) # Common Aspect Patch Tests @@ -31,6 +34,7 @@ def test_dataset_ownership_patch(wait_for_healthchecks): ) helper_test_ownership_patch(dataset_urn, DatasetPatchBuilder) + # Tags def test_dataset_tags_patch(wait_for_healthchecks): dataset_urn = make_dataset_urn( @@ -38,6 +42,7 @@ def test_dataset_tags_patch(wait_for_healthchecks): ) helper_test_dataset_tags_patch(dataset_urn, DatasetPatchBuilder) + # Terms def test_dataset_terms_patch(wait_for_healthchecks): dataset_urn = make_dataset_urn( @@ -284,8 +289,15 @@ def test_custom_properties_patch(wait_for_healthchecks): dataset_urn = 
make_dataset_urn( platform="hive", name=f"SampleHiveDataset-{uuid.uuid4()}", env="PROD" ) - orig_dataset_properties = DatasetPropertiesClass(name="test_name", description="test_description") + orig_dataset_properties = DatasetPropertiesClass( + name="test_name", description="test_description" + ) + helper_test_custom_properties_patch( + test_entity_urn=dataset_urn, + patch_builder_class=DatasetPatchBuilder, + custom_properties_aspect_class=DatasetPropertiesClass, + base_aspect=orig_dataset_properties, + ) with DataHubGraph(DataHubGraphConfig()) as graph: # Patch custom properties along with name diff --git a/smoke-test/tests/policies/test_policies.py b/smoke-test/tests/policies/test_policies.py index b7091541894ddd..67142181d2b960 100644 --- a/smoke-test/tests/policies/test_policies.py +++ b/smoke-test/tests/policies/test_policies.py @@ -1,12 +1,8 @@ import pytest import tenacity -from tests.utils import ( - get_frontend_url, - wait_for_healthcheck_util, - get_frontend_session, - get_sleep_info, - get_root_urn, -) + +from tests.utils import (get_frontend_session, get_frontend_url, get_root_urn, + get_sleep_info, wait_for_healthcheck_util) TEST_POLICY_NAME = "Updated Platform Policy" diff --git a/smoke-test/tests/privileges/test_privileges.py b/smoke-test/tests/privileges/test_privileges.py new file mode 100644 index 00000000000000..13d6b6cf3415a4 --- /dev/null +++ b/smoke-test/tests/privileges/test_privileges.py @@ -0,0 +1,241 @@ +import pytest +import tenacity + +from tests.utils import (get_frontend_session, wait_for_writes_to_sync, wait_for_healthcheck_util, + get_frontend_url, get_admin_credentials, get_sleep_info) +from tests.privileges.utils import * + +sleep_sec, sleep_times = get_sleep_info() + +@pytest.fixture(scope="session") +def wait_for_healthchecks(): + wait_for_healthcheck_util() + yield + + +@pytest.mark.dependency() +def test_healthchecks(wait_for_healthchecks): + # Call to wait_for_healthchecks fixture will do the actual functionality. 
+    pass
+
+
+@pytest.fixture(scope="session")
+def admin_session(wait_for_healthchecks):
+    yield get_frontend_session()
+
+
+@pytest.mark.dependency(depends=["test_healthchecks"])
+@pytest.fixture(scope="module", autouse=True)
+def privileges_and_test_user_setup(admin_session):
+    """Fixture to execute setup before and tear down after all tests are run"""
+    # Disable 'All users' privileges
+    set_base_platform_privileges_policy_status("INACTIVE", admin_session)
+    set_view_dataset_sensitive_info_policy_status("INACTIVE", admin_session)
+    set_view_entity_profile_privileges_policy_status("INACTIVE", admin_session)
+    # Sleep for eventual consistency
+    wait_for_writes_to_sync()
+
+    # Create a new user
+    admin_session = create_user(admin_session, "user", "user")
+
+    yield
+
+    # Remove test user
+    remove_user(admin_session, "urn:li:corpuser:user")
+
+    # Restore All users privileges
+    set_base_platform_privileges_policy_status("ACTIVE", admin_session)
+    set_view_dataset_sensitive_info_policy_status("ACTIVE", admin_session)
+    set_view_entity_profile_privileges_policy_status("ACTIVE", admin_session)
+
+    # Sleep for eventual consistency
+    wait_for_writes_to_sync()
+
+
+@tenacity.retry(
+    stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec)
+)
+def _ensure_can_create_secret(session, json, urn):
+    create_secret_success = session.post(
+        f"{get_frontend_url()}/api/v2/graphql", json=json)
+    create_secret_success.raise_for_status()
+    secret_data = create_secret_success.json()
+
+    assert secret_data
+    assert secret_data["data"]
+    assert secret_data["data"]["createSecret"]
+    assert secret_data["data"]["createSecret"] == urn
+
+
+@tenacity.retry(
+    stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec)
+)
+def _ensure_cant_create_secret(session, json):
+    create_secret_response = session.post(
+        f"{get_frontend_url()}/api/v2/graphql", json=json)
+    create_secret_response.raise_for_status()
+    create_secret_data = create_secret_response.json()
+
+    assert create_secret_data["errors"][0]["extensions"]["code"] == 403
+    assert create_secret_data["errors"][0]["extensions"]["type"] == "UNAUTHORIZED"
+    assert create_secret_data["data"]["createSecret"] is None
+
+
+@tenacity.retry(
+    stop=tenacity.stop_after_attempt(10), wait=tenacity.wait_fixed(sleep_sec)
+)
+def _ensure_can_create_ingestion_source(session, json):
+    create_ingestion_success = session.post(
+        f"{get_frontend_url()}/api/v2/graphql", json=json)
+    create_ingestion_success.raise_for_status()
+    ingestion_data = create_ingestion_success.json()
+
+    assert ingestion_data
+    assert ingestion_data["data"]
+    assert ingestion_data["data"]["createIngestionSource"]
+
+    return ingestion_data["data"]["createIngestionSource"]
+
+
+@tenacity.retry(
+    stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec)
+)
+def _ensure_cant_create_ingestion_source(session, json):
+    create_source_response = session.post(
+        f"{get_frontend_url()}/api/v2/graphql", json=json)
+    create_source_response.raise_for_status()
+    create_source_data = create_source_response.json()
+
+    assert create_source_data["errors"][0]["extensions"]["code"] == 403
+    assert create_source_data["errors"][0]["extensions"]["type"] == "UNAUTHORIZED"
+    assert create_source_data["data"]["createIngestionSource"] is None
+
+
+@pytest.mark.dependency(depends=["test_healthchecks"])
+def test_privilege_to_create_and_manage_secrets():
+
+    (admin_user, admin_pass) =
get_admin_credentials() + admin_session = login_as(admin_user, admin_pass) + user_session = login_as("user", "user") + secret_urn = "urn:li:dataHubSecret:TestSecretName" + + # Verify new user can't create secrets + create_secret = { + "query": """mutation createSecret($input: CreateSecretInput!) {\n + createSecret(input: $input)\n}""", + "variables": { + "input":{ + "name":"TestSecretName", + "value":"Test Secret Value", + "description":"Test Secret Description" + } + }, + } + _ensure_cant_create_secret(user_session, create_secret) + + + # Assign privileges to the new user to manage secrets + policy_urn = create_user_policy("urn:li:corpuser:user", ["MANAGE_SECRETS"], admin_session) + + # Verify new user can create and manage secrets + # Create a secret + _ensure_can_create_secret(user_session, create_secret, secret_urn) + + + # Remove a secret + remove_secret = { + "query": """mutation deleteSecret($urn: String!) {\n + deleteSecret(urn: $urn)\n}""", + "variables": { + "urn": secret_urn + }, + } + + remove_secret_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=remove_secret) + remove_secret_response.raise_for_status() + secret_data = remove_secret_response.json() + + assert secret_data + assert secret_data["data"] + assert secret_data["data"]["deleteSecret"] + assert secret_data["data"]["deleteSecret"] == secret_urn + + + # Remove the policy + remove_policy(policy_urn, admin_session) + + # Ensure user can't create secret after policy is removed + _ensure_cant_create_secret(user_session, create_secret) + + +@pytest.mark.dependency(depends=["test_healthchecks"]) +def test_privilege_to_create_and_manage_ingestion_source(): + + (admin_user, admin_pass) = get_admin_credentials() + admin_session = login_as(admin_user, admin_pass) + user_session = login_as("user", "user") + + # Verify new user can't create ingestion source + create_ingestion_source = { + "query": """mutation createIngestionSource($input: UpdateIngestionSourceInput!) {\n + createIngestionSource(input: $input)\n}""", + "variables": {"input":{"type":"snowflake","name":"test","config": + {"recipe": + "{\"source\":{\"type\":\"snowflake\",\"config\":{\"account_id\":null,\"include_table_lineage\":true,\"include_view_lineage\":true,\"include_tables\":true,\"include_views\":true,\"profiling\":{\"enabled\":true,\"profile_table_level_only\":true},\"stateful_ingestion\":{\"enabled\":true}}}}", + "executorId":"default","debugMode":False,"extraArgs":[]}}}, + } + + _ensure_cant_create_ingestion_source(user_session, create_ingestion_source) + + + # Assign privileges to the new user to manage ingestion source + policy_urn = create_user_policy("urn:li:corpuser:user", ["MANAGE_INGESTION"], admin_session) + + # Verify new user can create and manage ingestion source(edit, delete) + ingestion_source_urn = _ensure_can_create_ingestion_source(user_session, create_ingestion_source) + + # Edit ingestion source + update_ingestion_source = { + "query": """mutation updateIngestionSource($urn: String!, $input: UpdateIngestionSourceInput!) 
{\n + updateIngestionSource(urn: $urn, input: $input)\n}""", + "variables": {"urn":ingestion_source_urn, + "input":{"type":"snowflake","name":"test updated", + "config":{"recipe":"{\"source\":{\"type\":\"snowflake\",\"config\":{\"account_id\":null,\"include_table_lineage\":true,\"include_view_lineage\":true,\"include_tables\":true,\"include_views\":true,\"profiling\":{\"enabled\":true,\"profile_table_level_only\":true},\"stateful_ingestion\":{\"enabled\":true}}}}", + "executorId":"default","debugMode":False,"extraArgs":[]}}} + } + + update_ingestion_success = user_session.post( + f"{get_frontend_url()}/api/v2/graphql", json=update_ingestion_source) + update_ingestion_success.raise_for_status() + ingestion_data = update_ingestion_success.json() + + assert ingestion_data + assert ingestion_data["data"] + assert ingestion_data["data"]["updateIngestionSource"] + assert ingestion_data["data"]["updateIngestionSource"] == ingestion_source_urn + + + # Delete ingestion source + remove_ingestion_source = { + "query": """mutation deleteIngestionSource($urn: String!) {\n + deleteIngestionSource(urn: $urn)\n}""", + "variables": { + "urn": ingestion_source_urn + }, + } + + remove_ingestion_response = user_session.post(f"{get_frontend_url()}/api/v2/graphql", json=remove_ingestion_source) + remove_ingestion_response.raise_for_status() + ingestion_data = remove_ingestion_response.json() + + assert ingestion_data + assert ingestion_data["data"] + assert ingestion_data["data"]["deleteIngestionSource"] + assert ingestion_data["data"]["deleteIngestionSource"] == ingestion_source_urn + + # Remove the policy + remove_policy(policy_urn, admin_session) + + # Ensure that user can't create ingestion source after policy is removed + _ensure_cant_create_ingestion_source(user_session, create_ingestion_source) \ No newline at end of file diff --git a/smoke-test/tests/privileges/utils.py b/smoke-test/tests/privileges/utils.py new file mode 100644 index 00000000000000..ea1f565f6f5acd --- /dev/null +++ b/smoke-test/tests/privileges/utils.py @@ -0,0 +1,218 @@ +import requests_wrapper as requests +from tests.consistency_utils import wait_for_writes_to_sync +from tests.utils import (get_frontend_url, wait_for_writes_to_sync, get_admin_credentials) + + +def set_base_platform_privileges_policy_status(status, session): + base_platform_privileges = { + "query": """mutation updatePolicy($urn: String!, $input: PolicyUpdateInput!) {\n + updatePolicy(urn: $urn, input: $input) }""", + "variables": { + "urn": "urn:li:dataHubPolicy:7", + "input": { + "type": "PLATFORM", + "state": status, + "name": "All Users - Base Platform Privileges", + "description": "Grants base platform privileges to ALL users of DataHub. 
Change this policy to alter that behavior.", + "privileges": ["MANAGE_INGESTION", + "MANAGE_SECRETS", + "MANAGE_USERS_AND_GROUPS", + "VIEW_ANALYTICS", + "GENERATE_PERSONAL_ACCESS_TOKENS", + "MANAGE_DOMAINS", + "MANAGE_GLOBAL_ANNOUNCEMENTS", + "MANAGE_TESTS", + "MANAGE_GLOSSARIES", + "MANAGE_TAGS", + "MANAGE_GLOBAL_VIEWS", + "MANAGE_GLOBAL_OWNERSHIP_TYPES"], + "actors": { + "users": [], + "groups": None, + "resourceOwners": False, + "allUsers": True, + "allGroups": False, + "resourceOwnersTypes": None, + }, + }, + }, + } + base_privileges_response = session.post( + f"{get_frontend_url()}/api/v2/graphql", json=base_platform_privileges) + base_privileges_response.raise_for_status() + base_res_data = base_privileges_response.json() + assert base_res_data["data"]["updatePolicy"] == "urn:li:dataHubPolicy:7" + +def set_view_dataset_sensitive_info_policy_status(status, session): + dataset_sensitive_information = { + "query": """mutation updatePolicy($urn: String!, $input: PolicyUpdateInput!) {\n + updatePolicy(urn: $urn, input: $input) }""", + "variables": { + "urn": "urn:li:dataHubPolicy:view-dataset-sensitive", + "input": { + "type": "METADATA", + "state": status, + "name": "All Users - View Dataset Sensitive Information", + "description": "Grants viewing privileges of usage and profile information of all datasets for all users", + "privileges": ["VIEW_DATASET_USAGE","VIEW_DATASET_PROFILE"], + "actors": { + "users": [], + "groups": None, + "resourceOwners": False, + "allUsers": True, + "allGroups": False, + "resourceOwnersTypes": None, + }, + }, + }, + } + sensitive_info_response = session.post( + f"{get_frontend_url()}/api/v2/graphql", json=dataset_sensitive_information) + sensitive_info_response.raise_for_status() + sens_info_data = sensitive_info_response.json() + assert sens_info_data["data"]["updatePolicy"] == "urn:li:dataHubPolicy:view-dataset-sensitive" + +def set_view_entity_profile_privileges_policy_status(status, session): + view_entity_page = { + "query": """mutation updatePolicy($urn: String!, $input: PolicyUpdateInput!) {\n + updatePolicy(urn: $urn, input: $input) }""", + "variables": { + "urn": "urn:li:dataHubPolicy:view-entity-page-all", + "input": { + "type": "METADATA", + "state": status, + "name": "All Users - View Entity Page", + "description": "Grants entity view to all users", + "privileges": ["VIEW_ENTITY_PAGE", + "SEARCH_PRIVILEGE", + "GET_COUNTS_PRIVILEGE", + "GET_TIMESERIES_ASPECT_PRIVILEGE", + "GET_ENTITY_PRIVILEGE", + "GET_TIMELINE_PRIVILEGE"], + "actors": { + "users": [], + "groups": None, + "resourceOwners": False, + "allUsers": True, + "allGroups": False, + "resourceOwnersTypes": None, + }, + }, + }, + } + view_entity_response = session.post( + f"{get_frontend_url()}/api/v2/graphql", json=view_entity_page) + view_entity_response.raise_for_status() + view_entity_data = view_entity_response.json() + assert view_entity_data["data"]["updatePolicy"] == "urn:li:dataHubPolicy:view-entity-page-all" + +def create_user(session, email, password): + # Remove user if exists + res_data = remove_user(session, f"urn:li:corpuser:{email}") + assert res_data + assert "error" not in res_data + # Get the invite token + get_invite_token_json = { + "query": """query getInviteToken($input: GetInviteTokenInput!) 
{\n + getInviteToken(input: $input){\n + inviteToken\n + }\n + }""", + "variables": {"input": {}}, + } + get_invite_token_response = session.post( + f"{get_frontend_url()}/api/v2/graphql", json=get_invite_token_json + ) + get_invite_token_response.raise_for_status() + get_invite_token_res_data = get_invite_token_response.json() + invite_token = get_invite_token_res_data["data"]["getInviteToken"]["inviteToken"] + assert invite_token is not None + assert "error" not in invite_token + # Create a new user using the invite token + sign_up_json = { + "fullName": "Test User", + "email": email, + "password": password, + "title": "Data Engineer", + "inviteToken": invite_token, + } + sign_up_response = session.post( + f"{get_frontend_url()}/signUp", json=sign_up_json + ) + sign_up_response.raise_for_status() + assert sign_up_response + assert "error" not in sign_up_response + wait_for_writes_to_sync() + session.cookies.clear() + (admin_user, admin_pass) = get_admin_credentials() + admin_session = login_as(admin_user, admin_pass) + return admin_session + + +def login_as(username, password): + session = requests.Session() + headers = { + "Content-Type": "application/json", + } + data = '{"username":"' + username + '", "password":"' + password + '"}' + response = session.post(f"{get_frontend_url()}/logIn", headers=headers, data=data) + response.raise_for_status() + return session + +def remove_user(session, urn): + json = { + "query": """mutation removeUser($urn: String!) {\n + removeUser(urn: $urn) + }""", + "variables": {"urn": urn}, + } + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=json) + response.raise_for_status() + return response.json() + +def create_user_policy(user_urn, privileges, session): + policy = { + "query": """mutation createPolicy($input: PolicyUpdateInput!) {\n + createPolicy(input: $input) }""", + "variables": { + "input": { + "type": "PLATFORM", + "name": "Policy Name", + "description": "Policy Description", + "state": "ACTIVE", + "resources": {"filter":{"criteria":[]}}, + "privileges": privileges, + "actors": { + "users": [user_urn], + "resourceOwners": False, + "allUsers": False, + "allGroups": False, + }, + } + }, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=policy) + response.raise_for_status() + res_data = response.json() + + assert res_data + assert res_data["data"] + assert res_data["data"]["createPolicy"] + return res_data["data"]["createPolicy"] + +def remove_policy(urn, session): + remove_policy_json = { + "query": """mutation deletePolicy($urn: String!) 
{\n + deletePolicy(urn: $urn) }""", + "variables": {"urn": urn}, + } + + response = session.post(f"{get_frontend_url()}/api/v2/graphql", json=remove_policy_json) + response.raise_for_status() + res_data = response.json() + + assert res_data + assert res_data["data"] + assert res_data["data"]["deletePolicy"] + assert res_data["data"]["deletePolicy"] == urn \ No newline at end of file diff --git a/smoke-test/tests/setup/lineage/helper_classes.py b/smoke-test/tests/setup/lineage/helper_classes.py index 53f77b08d15edd..d550f3093be85c 100644 --- a/smoke-test/tests/setup/lineage/helper_classes.py +++ b/smoke-test/tests/setup/lineage/helper_classes.py @@ -1,10 +1,7 @@ from dataclasses import dataclass from typing import Any, Dict, List, Optional -from datahub.metadata.schema_classes import ( - EdgeClass, - SchemaFieldDataTypeClass, -) +from datahub.metadata.schema_classes import EdgeClass, SchemaFieldDataTypeClass @dataclass diff --git a/smoke-test/tests/setup/lineage/ingest_data_job_change.py b/smoke-test/tests/setup/lineage/ingest_data_job_change.py index 8e3e9c53529221..588a1625419bc2 100644 --- a/smoke-test/tests/setup/lineage/ingest_data_job_change.py +++ b/smoke-test/tests/setup/lineage/ingest_data_job_change.py @@ -1,36 +1,20 @@ from typing import List -from datahub.emitter.mce_builder import ( - make_dataset_urn, - make_data_flow_urn, - make_data_job_urn_with_flow, -) +from datahub.emitter.mce_builder import (make_data_flow_urn, + make_data_job_urn_with_flow, + make_dataset_urn) from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.metadata.schema_classes import ( - DateTypeClass, - NumberTypeClass, - SchemaFieldDataTypeClass, - StringTypeClass, -) +from datahub.metadata.schema_classes import (DateTypeClass, NumberTypeClass, + SchemaFieldDataTypeClass, + StringTypeClass) -from tests.setup.lineage.constants import ( - AIRFLOW_DATA_PLATFORM, - SNOWFLAKE_DATA_PLATFORM, - TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, - TIMESTAMP_MILLIS_ONE_DAY_AGO, -) -from tests.setup.lineage.helper_classes import ( - Field, - Dataset, - Task, - Pipeline, -) -from tests.setup.lineage.utils import ( - create_edge, - create_node, - create_nodes_and_edges, - emit_mcps, -) +from tests.setup.lineage.constants import (AIRFLOW_DATA_PLATFORM, + SNOWFLAKE_DATA_PLATFORM, + TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, + TIMESTAMP_MILLIS_ONE_DAY_AGO) +from tests.setup.lineage.helper_classes import Dataset, Field, Pipeline, Task +from tests.setup.lineage.utils import (create_edge, create_node, + create_nodes_and_edges, emit_mcps) # Constants for Case 2 DAILY_TEMPERATURE_DATASET_ID = "climate.daily_temperature" diff --git a/smoke-test/tests/setup/lineage/ingest_dataset_join_change.py b/smoke-test/tests/setup/lineage/ingest_dataset_join_change.py index 35a8e6d5cf02ea..bb9f51b6b5e9b7 100644 --- a/smoke-test/tests/setup/lineage/ingest_dataset_join_change.py +++ b/smoke-test/tests/setup/lineage/ingest_dataset_join_change.py @@ -1,32 +1,18 @@ from typing import List -from datahub.emitter.mce_builder import ( - make_dataset_urn, -) +from datahub.emitter.mce_builder import make_dataset_urn from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.metadata.schema_classes import ( - NumberTypeClass, - SchemaFieldDataTypeClass, - StringTypeClass, - UpstreamClass, -) +from datahub.metadata.schema_classes import (NumberTypeClass, + SchemaFieldDataTypeClass, + StringTypeClass, UpstreamClass) -from tests.setup.lineage.constants import ( - DATASET_ENTITY_TYPE, - SNOWFLAKE_DATA_PLATFORM, - TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, - 
TIMESTAMP_MILLIS_ONE_DAY_AGO, -) -from tests.setup.lineage.helper_classes import ( - Field, - Dataset, -) -from tests.setup.lineage.utils import ( - create_node, - create_upstream_edge, - create_upstream_mcp, - emit_mcps, -) +from tests.setup.lineage.constants import (DATASET_ENTITY_TYPE, + SNOWFLAKE_DATA_PLATFORM, + TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, + TIMESTAMP_MILLIS_ONE_DAY_AGO) +from tests.setup.lineage.helper_classes import Dataset, Field +from tests.setup.lineage.utils import (create_node, create_upstream_edge, + create_upstream_mcp, emit_mcps) # Constants for Case 3 GDP_DATASET_ID = "economic_data.gdp" diff --git a/smoke-test/tests/setup/lineage/ingest_input_datasets_change.py b/smoke-test/tests/setup/lineage/ingest_input_datasets_change.py index f4fb7951474780..6079d7a3d2b63b 100644 --- a/smoke-test/tests/setup/lineage/ingest_input_datasets_change.py +++ b/smoke-test/tests/setup/lineage/ingest_input_datasets_change.py @@ -1,36 +1,20 @@ from typing import List -from datahub.emitter.mce_builder import ( - make_dataset_urn, - make_data_flow_urn, - make_data_job_urn_with_flow, -) +from datahub.emitter.mce_builder import (make_data_flow_urn, + make_data_job_urn_with_flow, + make_dataset_urn) from datahub.emitter.rest_emitter import DatahubRestEmitter -from datahub.metadata.schema_classes import ( - NumberTypeClass, - SchemaFieldDataTypeClass, - StringTypeClass, -) - -from tests.setup.lineage.constants import ( - AIRFLOW_DATA_PLATFORM, - BQ_DATA_PLATFORM, - TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, - TIMESTAMP_MILLIS_ONE_DAY_AGO, -) -from tests.setup.lineage.helper_classes import ( - Field, - Dataset, - Task, - Pipeline, -) -from tests.setup.lineage.utils import ( - create_edge, - create_node, - create_nodes_and_edges, - emit_mcps, -) +from datahub.metadata.schema_classes import (NumberTypeClass, + SchemaFieldDataTypeClass, + StringTypeClass) +from tests.setup.lineage.constants import (AIRFLOW_DATA_PLATFORM, + BQ_DATA_PLATFORM, + TIMESTAMP_MILLIS_EIGHT_DAYS_AGO, + TIMESTAMP_MILLIS_ONE_DAY_AGO) +from tests.setup.lineage.helper_classes import Dataset, Field, Pipeline, Task +from tests.setup.lineage.utils import (create_edge, create_node, + create_nodes_and_edges, emit_mcps) # Constants for Case 1 TRANSACTIONS_DATASET_ID = "transactions.transactions" diff --git a/smoke-test/tests/setup/lineage/ingest_time_lineage.py b/smoke-test/tests/setup/lineage/ingest_time_lineage.py index cae8e0124d5018..3aec979707290d 100644 --- a/smoke-test/tests/setup/lineage/ingest_time_lineage.py +++ b/smoke-test/tests/setup/lineage/ingest_time_lineage.py @@ -1,12 +1,14 @@ +import os from typing import List from datahub.emitter.rest_emitter import DatahubRestEmitter -from tests.setup.lineage.ingest_input_datasets_change import ingest_input_datasets_change, get_input_datasets_change_urns -from tests.setup.lineage.ingest_data_job_change import ingest_data_job_change, get_data_job_change_urns -from tests.setup.lineage.ingest_dataset_join_change import ingest_dataset_join_change, get_dataset_join_change_urns - -import os +from tests.setup.lineage.ingest_data_job_change import ( + get_data_job_change_urns, ingest_data_job_change) +from tests.setup.lineage.ingest_dataset_join_change import ( + get_dataset_join_change_urns, ingest_dataset_join_change) +from tests.setup.lineage.ingest_input_datasets_change import ( + get_input_datasets_change_urns, ingest_input_datasets_change) SERVER = os.getenv("DATAHUB_SERVER") or "http://localhost:8080" TOKEN = os.getenv("DATAHUB_TOKEN") or "" @@ -20,4 +22,8 @@ def ingest_time_lineage() -> 
None: def get_time_lineage_urns() -> List[str]: - return get_input_datasets_change_urns() + get_data_job_change_urns() + get_dataset_join_change_urns() + return ( + get_input_datasets_change_urns() + + get_data_job_change_urns() + + get_dataset_join_change_urns() + ) diff --git a/smoke-test/tests/setup/lineage/utils.py b/smoke-test/tests/setup/lineage/utils.py index 672f7a945a6af0..c72f6ccb89b7ad 100644 --- a/smoke-test/tests/setup/lineage/utils.py +++ b/smoke-test/tests/setup/lineage/utils.py @@ -1,41 +1,30 @@ import datetime -from datahub.emitter.mce_builder import ( - make_data_platform_urn, - make_dataset_urn, - make_data_job_urn_with_flow, - make_data_flow_urn, -) +from typing import List + +from datahub.emitter.mce_builder import (make_data_flow_urn, + make_data_job_urn_with_flow, + make_data_platform_urn, + make_dataset_urn) from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.rest_emitter import DatahubRestEmitter from datahub.metadata.com.linkedin.pegasus2avro.dataset import UpstreamLineage -from datahub.metadata.schema_classes import ( - AuditStampClass, - ChangeTypeClass, - DatasetLineageTypeClass, - DatasetPropertiesClass, - DataFlowInfoClass, - DataJobInputOutputClass, - DataJobInfoClass, - EdgeClass, - MySqlDDLClass, - SchemaFieldClass, - SchemaMetadataClass, - UpstreamClass, -) -from typing import List - -from tests.setup.lineage.constants import ( - DATASET_ENTITY_TYPE, - DATA_JOB_ENTITY_TYPE, - DATA_FLOW_ENTITY_TYPE, - DATA_FLOW_INFO_ASPECT_NAME, - DATA_JOB_INFO_ASPECT_NAME, - DATA_JOB_INPUT_OUTPUT_ASPECT_NAME, -) -from tests.setup.lineage.helper_classes import ( - Dataset, - Pipeline, -) +from datahub.metadata.schema_classes import (AuditStampClass, ChangeTypeClass, + DataFlowInfoClass, + DataJobInfoClass, + DataJobInputOutputClass, + DatasetLineageTypeClass, + DatasetPropertiesClass, EdgeClass, + MySqlDDLClass, SchemaFieldClass, + SchemaMetadataClass, + UpstreamClass) + +from tests.setup.lineage.constants import (DATA_FLOW_ENTITY_TYPE, + DATA_FLOW_INFO_ASPECT_NAME, + DATA_JOB_ENTITY_TYPE, + DATA_JOB_INFO_ASPECT_NAME, + DATA_JOB_INPUT_OUTPUT_ASPECT_NAME, + DATASET_ENTITY_TYPE) +from tests.setup.lineage.helper_classes import Dataset, Pipeline def create_node(dataset: Dataset) -> List[MetadataChangeProposalWrapper]: @@ -85,10 +74,10 @@ def create_node(dataset: Dataset) -> List[MetadataChangeProposalWrapper]: def create_edge( - source_urn: str, - destination_urn: str, - created_timestamp_millis: int, - updated_timestamp_millis: int, + source_urn: str, + destination_urn: str, + created_timestamp_millis: int, + updated_timestamp_millis: int, ) -> EdgeClass: created_audit_stamp: AuditStampClass = AuditStampClass( time=created_timestamp_millis, actor="urn:li:corpuser:unknown" @@ -105,7 +94,7 @@ def create_edge( def create_nodes_and_edges( - airflow_dag: Pipeline, + airflow_dag: Pipeline, ) -> List[MetadataChangeProposalWrapper]: mcps = [] data_flow_urn = make_data_flow_urn( @@ -160,9 +149,9 @@ def create_nodes_and_edges( def create_upstream_edge( - upstream_entity_urn: str, - created_timestamp_millis: int, - updated_timestamp_millis: int, + upstream_entity_urn: str, + created_timestamp_millis: int, + updated_timestamp_millis: int, ): created_audit_stamp: AuditStampClass = AuditStampClass( time=created_timestamp_millis, actor="urn:li:corpuser:unknown" @@ -180,11 +169,11 @@ def create_upstream_edge( def create_upstream_mcp( - entity_type: str, - entity_urn: str, - upstreams: List[UpstreamClass], - timestamp_millis: int, - run_id: str = "", + 
entity_type: str, + entity_urn: str, + upstreams: List[UpstreamClass], + timestamp_millis: int, + run_id: str = "", ) -> MetadataChangeProposalWrapper: print(f"Creating upstreamLineage aspect for {entity_urn}") timestamp_millis: int = int(datetime.datetime.now().timestamp() * 1000) @@ -203,7 +192,7 @@ def create_upstream_mcp( def emit_mcps( - emitter: DatahubRestEmitter, mcps: List[MetadataChangeProposalWrapper] + emitter: DatahubRestEmitter, mcps: List[MetadataChangeProposalWrapper] ) -> None: for mcp in mcps: emitter.emit_mcp(mcp) diff --git a/smoke-test/tests/tags-and-terms/tags_and_terms_test.py b/smoke-test/tests/tags-and-terms/tags_and_terms_test.py index b0ca29b544cfef..6ac75765286f00 100644 --- a/smoke-test/tests/tags-and-terms/tags_and_terms_test.py +++ b/smoke-test/tests/tags-and-terms/tags_and_terms_test.py @@ -1,5 +1,7 @@ import pytest -from tests.utils import delete_urns_from_file, get_frontend_url, ingest_file_via_rest, wait_for_healthcheck_util + +from tests.utils import (delete_urns_from_file, get_frontend_url, + ingest_file_via_rest, wait_for_healthcheck_util) @pytest.fixture(scope="module", autouse=True) diff --git a/smoke-test/tests/telemetry/telemetry_test.py b/smoke-test/tests/telemetry/telemetry_test.py index 3672abcda948de..3127061c9f5061 100644 --- a/smoke-test/tests/telemetry/telemetry_test.py +++ b/smoke-test/tests/telemetry/telemetry_test.py @@ -7,5 +7,7 @@ def test_no_clientID(): client_id_urn = "urn:li:telemetry:clientId" aspect = ["telemetryClientId"] - res_data = json.dumps(get_aspects_for_entity(entity_urn=client_id_urn, aspects=aspect, typed=False)) + res_data = json.dumps( + get_aspects_for_entity(entity_urn=client_id_urn, aspects=aspect, typed=False) + ) assert res_data == "{}" diff --git a/smoke-test/tests/test_result_msg.py b/smoke-test/tests/test_result_msg.py index e3b336db9d66c4..b9775e8ee4acd3 100644 --- a/smoke-test/tests/test_result_msg.py +++ b/smoke-test/tests/test_result_msg.py @@ -1,6 +1,6 @@ -from slack_sdk import WebClient import os +from slack_sdk import WebClient datahub_stats = {} @@ -10,10 +10,10 @@ def add_datahub_stats(stat_name, stat_val): def send_to_slack(passed: str): - slack_api_token = os.getenv('SLACK_API_TOKEN') - slack_channel = os.getenv('SLACK_CHANNEL') - slack_thread_ts = os.getenv('SLACK_THREAD_TS') - test_identifier = os.getenv('TEST_IDENTIFIER', 'LOCAL_TEST') + slack_api_token = os.getenv("SLACK_API_TOKEN") + slack_channel = os.getenv("SLACK_CHANNEL") + slack_thread_ts = os.getenv("SLACK_THREAD_TS") + test_identifier = os.getenv("TEST_IDENTIFIER", "LOCAL_TEST") if slack_api_token is None or slack_channel is None: return client = WebClient(token=slack_api_token) @@ -26,14 +26,21 @@ def send_to_slack(passed: str): message += f"Num {entity_type} is {val}\n" if slack_thread_ts is None: - client.chat_postMessage(channel=slack_channel, text=f'{test_identifier} Status - {passed}\n{message}') + client.chat_postMessage( + channel=slack_channel, + text=f"{test_identifier} Status - {passed}\n{message}", + ) else: - client.chat_postMessage(channel=slack_channel, text=f'{test_identifier} Status - {passed}\n{message}', thread_ts=slack_thread_ts) + client.chat_postMessage( + channel=slack_channel, + text=f"{test_identifier} Status - {passed}\n{message}", + thread_ts=slack_thread_ts, + ) def send_message(exitstatus): try: - send_to_slack('PASSED' if exitstatus == 0 else 'FAILED') + send_to_slack("PASSED" if exitstatus == 0 else "FAILED") except Exception as e: # We don't want to fail pytest at all print(f"Exception happened for sending 
msg to slack {e}") diff --git a/smoke-test/tests/test_stateful_ingestion.py b/smoke-test/tests/test_stateful_ingestion.py index a10cf13a08029d..c6adb402e5d510 100644 --- a/smoke-test/tests/test_stateful_ingestion.py +++ b/smoke-test/tests/test_stateful_ingestion.py @@ -4,17 +4,15 @@ from datahub.ingestion.run.pipeline import Pipeline from datahub.ingestion.source.sql.mysql import MySQLConfig, MySQLSource from datahub.ingestion.source.state.checkpoint import Checkpoint -from datahub.ingestion.source.state.entity_removal_state import GenericCheckpointState -from datahub.ingestion.source.state.stale_entity_removal_handler import StaleEntityRemovalHandler +from datahub.ingestion.source.state.entity_removal_state import \ + GenericCheckpointState +from datahub.ingestion.source.state.stale_entity_removal_handler import \ + StaleEntityRemovalHandler from sqlalchemy import create_engine from sqlalchemy.sql import text -from tests.utils import ( - get_gms_url, - get_mysql_password, - get_mysql_url, - get_mysql_username, -) +from tests.utils import (get_gms_url, get_mysql_password, get_mysql_url, + get_mysql_username) def test_stateful_ingestion(wait_for_healthchecks): diff --git a/smoke-test/tests/tests/tests_test.py b/smoke-test/tests/tests/tests_test.py index 0b87f90a92c58e..213a2ea087b7a1 100644 --- a/smoke-test/tests/tests/tests_test.py +++ b/smoke-test/tests/tests/tests_test.py @@ -1,9 +1,13 @@ import pytest import tenacity -from tests.utils import delete_urns_from_file, get_frontend_url, ingest_file_via_rest, wait_for_healthcheck_util, get_sleep_info + +from tests.utils import (delete_urns_from_file, get_frontend_url, + get_sleep_info, ingest_file_via_rest, + wait_for_healthcheck_util) sleep_sec, sleep_times = get_sleep_info() + @pytest.fixture(scope="module", autouse=True) def ingest_cleanup_data(request): print("ingesting test data") @@ -18,6 +22,7 @@ def wait_for_healthchecks(): wait_for_healthcheck_util() yield + @pytest.mark.dependency() def test_healthchecks(wait_for_healthchecks): # Call to wait_for_healthchecks fixture will do the actual functionality. 
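The timeline tests reformatted in the next file all follow one flow: ingest a few versions of an aspect for a single Kafka dataset URN, ask timeline_cli for the change transactions in one category, then hard-delete the entity. A minimal sketch of that flow, mirroring the arguments the tests pass (the trailing None, None, False are the start/end bounds and a raw-diff flag, which these tests leave unset):

from datahub.cli import timeline_cli

# Change history for one dataset, restricted to schema changes; the
# trailing None, None, False mirror how the tests below call it.
dataset_urn = "urn:li:dataset:(urn:li:dataPlatform:kafka,test-timeline-sample-kafka,PROD)"
res_data = timeline_cli.get_timeline(
    dataset_urn, ["TECHNICAL_SCHEMA"], None, None, False
)
assert res_data
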
diff --git a/smoke-test/tests/timeline/timeline_test.py b/smoke-test/tests/timeline/timeline_test.py index a73d585c6c72d5..4705343c1a2baf 100644 --- a/smoke-test/tests/timeline/timeline_test.py +++ b/smoke-test/tests/timeline/timeline_test.py @@ -3,14 +3,14 @@ from datahub.cli import timeline_cli from datahub.cli.cli_utils import guess_entity_type, post_entity -from tests.utils import ingest_file_via_rest, wait_for_writes_to_sync, get_datahub_graph + +from tests.utils import (get_datahub_graph, ingest_file_via_rest, + wait_for_writes_to_sync) def test_all(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -18,8 +18,13 @@ def test_all(): ingest_file_via_rest("tests/timeline/timeline_test_datav2.json") ingest_file_via_rest("tests/timeline/timeline_test_datav3.json") - res_data = timeline_cli.get_timeline(dataset_urn, ["TAG", "DOCUMENTATION", "TECHNICAL_SCHEMA", "GLOSSARY_TERM", - "OWNER"], None, None, False) + res_data = timeline_cli.get_timeline( + dataset_urn, + ["TAG", "DOCUMENTATION", "TECHNICAL_SCHEMA", "GLOSSARY_TERM", "OWNER"], + None, + None, + False, + ) get_datahub_graph().hard_delete_entity(urn=dataset_urn) assert res_data @@ -35,9 +40,7 @@ def test_all(): def test_schema(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -45,7 +48,9 @@ def test_schema(): put(dataset_urn, "schemaMetadata", "test_resources/timeline/newschemav2.json") put(dataset_urn, "schemaMetadata", "test_resources/timeline/newschemav3.json") - res_data = timeline_cli.get_timeline(dataset_urn, ["TECHNICAL_SCHEMA"], None, None, False) + res_data = timeline_cli.get_timeline( + dataset_urn, ["TECHNICAL_SCHEMA"], None, None, False + ) get_datahub_graph().hard_delete_entity(urn=dataset_urn) assert res_data @@ -61,9 +66,7 @@ def test_schema(): def test_glossary(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -71,7 +74,9 @@ def test_glossary(): put(dataset_urn, "glossaryTerms", "test_resources/timeline/newglossaryv2.json") put(dataset_urn, "glossaryTerms", "test_resources/timeline/newglossaryv3.json") - res_data = timeline_cli.get_timeline(dataset_urn, ["GLOSSARY_TERM"], None, None, False) + res_data = timeline_cli.get_timeline( + dataset_urn, ["GLOSSARY_TERM"], None, None, False + ) get_datahub_graph().hard_delete_entity(urn=dataset_urn) assert res_data @@ -87,17 +92,29 @@ def test_glossary(): def test_documentation(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" - put(dataset_urn, "institutionalMemory", "test_resources/timeline/newdocumentation.json") - put(dataset_urn, "institutionalMemory", "test_resources/timeline/newdocumentationv2.json") - put(dataset_urn, "institutionalMemory", "test_resources/timeline/newdocumentationv3.json") + put( + dataset_urn, + "institutionalMemory", + "test_resources/timeline/newdocumentation.json", + ) + put( + dataset_urn, + "institutionalMemory", + "test_resources/timeline/newdocumentationv2.json", + ) 
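+    # A third institutionalMemory version, so the DOCUMENTATION timeline
+    # query below has a history of changes to report rather than one write.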
+ put( + dataset_urn, + "institutionalMemory", + "test_resources/timeline/newdocumentationv3.json", + ) - res_data = timeline_cli.get_timeline(dataset_urn, ["DOCUMENTATION"], None, None, False) + res_data = timeline_cli.get_timeline( + dataset_urn, ["DOCUMENTATION"], None, None, False + ) get_datahub_graph().hard_delete_entity(urn=dataset_urn) assert res_data @@ -113,9 +130,7 @@ def test_documentation(): def test_tags(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" @@ -139,9 +154,7 @@ def test_tags(): def test_ownership(): platform = "urn:li:dataPlatform:kafka" - dataset_name = ( - "test-timeline-sample-kafka" - ) + dataset_name = "test-timeline-sample-kafka" env = "PROD" dataset_urn = f"urn:li:dataset:({platform},{dataset_name},{env})" diff --git a/smoke-test/tests/tokens/revokable_access_token_test.py b/smoke-test/tests/tokens/revokable_access_token_test.py index b10ad3aa3fc2a2..55f3de594af4e2 100644 --- a/smoke-test/tests/tokens/revokable_access_token_test.py +++ b/smoke-test/tests/tokens/revokable_access_token_test.py @@ -1,15 +1,11 @@ import os -import pytest -import requests from time import sleep -from tests.utils import ( - get_frontend_url, - wait_for_healthcheck_util, - get_admin_credentials, - wait_for_writes_to_sync, -) +import pytest +import requests +from tests.utils import (get_admin_credentials, get_frontend_url, + wait_for_healthcheck_util, wait_for_writes_to_sync) # Disable telemetry os.environ["DATAHUB_TELEMETRY_ENABLED"] = "false" diff --git a/smoke-test/tests/utils.py b/smoke-test/tests/utils.py index af03efd4f71f8c..bd75b13d1910f6 100644 --- a/smoke-test/tests/utils.py +++ b/smoke-test/tests/utils.py @@ -1,19 +1,20 @@ import functools import json +import logging import os -from datetime import datetime, timedelta, timezone import subprocess import time -from typing import Any, Dict, List, Tuple +from datetime import datetime, timedelta, timezone from time import sleep -from joblib import Parallel, delayed +from typing import Any, Dict, List, Tuple -import requests_wrapper as requests -import logging from datahub.cli import cli_utils from datahub.cli.cli_utils import get_system_auth -from datahub.ingestion.graph.client import DataHubGraph, DatahubClientConfig +from datahub.ingestion.graph.client import DatahubClientConfig, DataHubGraph from datahub.ingestion.run.pipeline import Pipeline +from joblib import Parallel, delayed + +import requests_wrapper as requests from tests.consistency_utils import wait_for_writes_to_sync TIME: int = 1581407189000 @@ -174,6 +175,7 @@ def delete(entry): wait_for_writes_to_sync() + # Fixed now value NOW: datetime = datetime.now() @@ -232,6 +234,3 @@ def create_datahub_step_state_aspects( ] with open(onboarding_filename, "w") as f: json.dump(aspects_dict, f, indent=2) - - - diff --git a/smoke-test/tests/views/views_test.py b/smoke-test/tests/views/views_test.py index 4da69750a167b1..685c3bd80b04d8 100644 --- a/smoke-test/tests/views/views_test.py +++ b/smoke-test/tests/views/views_test.py @@ -1,16 +1,14 @@ -import pytest import time + +import pytest import tenacity -from tests.utils import ( - delete_urns_from_file, - get_frontend_url, - get_gms_url, - ingest_file_via_rest, - get_sleep_info, -) + +from tests.utils import (delete_urns_from_file, get_frontend_url, get_gms_url, + get_sleep_info, ingest_file_via_rest) sleep_sec, sleep_times = get_sleep_info() + 
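+# The tests in this file chain on one another through pytest-dependency: a
+# test marked with @pytest.mark.dependency(depends=[...]) is skipped
+# automatically when anything it depends on failed, so one broken view
+# mutation does not produce a cascade of misleading failures downstream.
+# The pattern, with illustrative test names:
+#
+#     @pytest.mark.dependency()
+#     def test_base():
+#         ...
+#
+#     @pytest.mark.dependency(depends=["test_base"])
+#     def test_builds_on_base():
+#         ...  # runs only when test_base passed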
@pytest.mark.dependency() def test_healthchecks(wait_for_healthchecks): # Call to wait_for_healthchecks fixture will do the actual functionality. @@ -40,6 +38,7 @@ def _ensure_more_views(frontend_session, list_views_json, query_name, before_cou assert after_count == before_count + 1 return after_count + @tenacity.retry( stop=tenacity.stop_after_attempt(sleep_times), wait=tenacity.wait_fixed(sleep_sec) ) @@ -111,18 +110,18 @@ def test_create_list_delete_global_view(frontend_session): new_view_name = "Test View" new_view_description = "Test Description" new_view_definition = { - "entityTypes": ["DATASET", "DASHBOARD"], - "filter": { - "operator": "AND", - "filters": [ - { - "field": "tags", - "values": ["urn:li:tag:test"], - "negated": False, - "condition": "EQUAL" - } - ] - } + "entityTypes": ["DATASET", "DASHBOARD"], + "filter": { + "operator": "AND", + "filters": [ + { + "field": "tags", + "values": ["urn:li:tag:test"], + "negated": False, + "condition": "EQUAL", + } + ], + }, } # Create new View @@ -137,7 +136,7 @@ def test_create_list_delete_global_view(frontend_session): "viewType": "GLOBAL", "name": new_view_name, "description": new_view_description, - "definition": new_view_definition + "definition": new_view_definition, } }, } @@ -169,9 +168,7 @@ def test_create_list_delete_global_view(frontend_session): "query": """mutation deleteView($urn: String!) {\n deleteView(urn: $urn) }""", - "variables": { - "urn": view_urn - }, + "variables": {"urn": view_urn}, } response = frontend_session.post( @@ -189,7 +186,9 @@ def test_create_list_delete_global_view(frontend_session): ) -@pytest.mark.dependency(depends=["test_healthchecks", "test_create_list_delete_global_view"]) +@pytest.mark.dependency( + depends=["test_healthchecks", "test_create_list_delete_global_view"] +) def test_create_list_delete_personal_view(frontend_session): # Get count of existing views @@ -237,18 +236,18 @@ def test_create_list_delete_personal_view(frontend_session): new_view_name = "Test View" new_view_description = "Test Description" new_view_definition = { - "entityTypes": ["DATASET", "DASHBOARD"], - "filter": { - "operator": "AND", - "filters": [ - { - "field": "tags", - "values": ["urn:li:tag:test"], - "negated": False, - "condition": "EQUAL" - } - ] - } + "entityTypes": ["DATASET", "DASHBOARD"], + "filter": { + "operator": "AND", + "filters": [ + { + "field": "tags", + "values": ["urn:li:tag:test"], + "negated": False, + "condition": "EQUAL", + } + ], + }, } # Create new View @@ -263,7 +262,7 @@ def test_create_list_delete_personal_view(frontend_session): "viewType": "PERSONAL", "name": new_view_name, "description": new_view_description, - "definition": new_view_definition + "definition": new_view_definition, } }, } @@ -293,9 +292,7 @@ def test_create_list_delete_personal_view(frontend_session): "query": """mutation deleteView($urn: String!) 
{\n deleteView(urn: $urn) }""", - "variables": { - "urn": view_urn - }, + "variables": {"urn": view_urn}, } response = frontend_session.post( @@ -312,25 +309,28 @@ def test_create_list_delete_personal_view(frontend_session): before_count=new_count, ) -@pytest.mark.dependency(depends=["test_healthchecks", "test_create_list_delete_personal_view"]) + +@pytest.mark.dependency( + depends=["test_healthchecks", "test_create_list_delete_personal_view"] +) def test_update_global_view(frontend_session): # First create a view new_view_name = "Test View" new_view_description = "Test Description" new_view_definition = { - "entityTypes": ["DATASET", "DASHBOARD"], - "filter": { - "operator": "AND", - "filters": [ - { - "field": "tags", - "values": ["urn:li:tag:test"], - "negated": False, - "condition": "EQUAL" - } - ] - } + "entityTypes": ["DATASET", "DASHBOARD"], + "filter": { + "operator": "AND", + "filters": [ + { + "field": "tags", + "values": ["urn:li:tag:test"], + "negated": False, + "condition": "EQUAL", + } + ], + }, } # Create new View @@ -345,7 +345,7 @@ def test_update_global_view(frontend_session): "viewType": "PERSONAL", "name": new_view_name, "description": new_view_description, - "definition": new_view_definition + "definition": new_view_definition, } }, } @@ -366,18 +366,18 @@ def test_update_global_view(frontend_session): new_view_name = "New Test View" new_view_description = "New Test Description" new_view_definition = { - "entityTypes": ["DATASET", "DASHBOARD", "CHART", "DATA_FLOW"], - "filter": { - "operator": "OR", - "filters": [ - { - "field": "glossaryTerms", - "values": ["urn:li:glossaryTerm:test"], - "negated": True, - "condition": "CONTAIN" - } - ] - } + "entityTypes": ["DATASET", "DASHBOARD", "CHART", "DATA_FLOW"], + "filter": { + "operator": "OR", + "filters": [ + { + "field": "glossaryTerms", + "values": ["urn:li:glossaryTerm:test"], + "negated": True, + "condition": "CONTAIN", + } + ], + }, } update_view_json = { @@ -391,8 +391,8 @@ def test_update_global_view(frontend_session): "input": { "name": new_view_name, "description": new_view_description, - "definition": new_view_definition - } + "definition": new_view_definition, + }, }, } @@ -411,9 +411,7 @@ def test_update_global_view(frontend_session): "query": """mutation deleteView($urn: String!) {\n deleteView(urn: $urn) }""", - "variables": { - "urn": view_urn - }, + "variables": {"urn": view_urn}, } response = frontend_session.post(