
Merge branch 'datahub-project:master' into master
anshbansal authored Jun 26, 2024
2 parents 3b32ec6 + 5df6468 commit 425b9cb
Showing 16 changed files with 335 additions and 327 deletions.
16 changes: 7 additions & 9 deletions datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx
@@ -119,11 +119,11 @@ export class DatasetEntity implements Entity<Dataset> {
                 component: ViewDefinitionTab,
                 display: {
                     visible: (_, dataset: GetDatasetQuery) =>
-                        dataset?.dataset?.subTypes?.typeNames
+                        !!dataset?.dataset?.viewProperties?.logic ||
+                        !!dataset?.dataset?.subTypes?.typeNames
                             ?.map((t) => t.toLocaleLowerCase())
-                            .includes(SUBTYPES.VIEW.toLocaleLowerCase()) || false,
-                    enabled: (_, dataset: GetDatasetQuery) =>
-                        (dataset?.dataset?.viewProperties?.logic && true) || false,
+                            .includes(SUBTYPES.VIEW.toLocaleLowerCase()),
+                    enabled: (_, dataset: GetDatasetQuery) => !!dataset?.dataset?.viewProperties?.logic,
                 },
             },
             {
@@ -178,8 +178,7 @@ export class DatasetEntity implements Entity<Dataset> {
                 },
             },
             {
-                name: 'Runs',
-                // TODO: Rename this to DatasetRunsTab.
+                name: 'Runs', // TODO: Rename this to DatasetRunsTab.
                 component: OperationsTab,
                 display: {
                     visible: (_, dataset: GetDatasetQuery) => {
@@ -234,7 +233,7 @@ export class DatasetEntity implements Entity<Dataset> {
         {
             component: SidebarViewDefinitionSection,
             display: {
-                visible: (_, dataset: GetDatasetQuery) => (dataset?.dataset?.viewProperties?.logic && true) || false,
+                visible: (_, dataset: GetDatasetQuery) => !!dataset?.dataset?.viewProperties?.logic,
             },
         },
         {
@@ -249,8 +248,7 @@ export class DatasetEntity implements Entity<Dataset> {
         },
         {
             component: DataProductSection,
-        },
-        // TODO: Add back once entity-level recommendations are complete.
+        }, // TODO: Add back once entity-level recommendations are complete.
         // {
         //     component: SidebarRecommendationsSection,
         // },
8 changes: 6 additions & 2 deletions docker/profiles/docker-compose.gms.yml
@@ -99,8 +99,7 @@ x-datahub-gms-service: &datahub-gms-service
     - ${DATAHUB_LOCAL_GMS_ENV:-empty2.env}
   environment: &datahub-gms-env
     <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env]
-    ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED: true
-    ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: '/etc/datahub/search/search_config.yaml'
+    ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml}
   healthcheck:
     test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
     start_period: 90s
@@ -119,8 +118,13 @@ x-datahub-gms-service-dev: &datahub-gms-service-dev
   ports:
     - ${DATAHUB_MAPPED_GMS_DEBUG_PORT:-5001}:5001
     - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
+  env_file:
+    - datahub-gms/env/docker.env
+    - ${DATAHUB_LOCAL_COMMON_ENV:-empty.env}
+    - ${DATAHUB_LOCAL_GMS_ENV:-empty2.env}
   environment: &datahub-gms-dev-env
     <<: [*datahub-dev-telemetry-env, *datahub-gms-env]
+    ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-/etc/datahub/search/search_config.yaml}
     SKIP_ELASTICSEARCH_CHECK: false
     JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5001'
     BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE: false
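A note on the `${VAR:-default}` interpolation used above: Compose substitutes the default whenever the variable is unset or empty in the launching environment, which is how `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE` now gets a per-profile default while remaining overridable. A minimal Python sketch of the same resolution rule (illustrative only, not DataHub code):

```python
import os

def resolve_custom_config_file(default: str = "search_config.yaml") -> str:
    # Mirrors compose's ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-default}:
    # an unset *or* empty variable falls back to the default value.
    value = os.environ.get("ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE")
    return value if value else default

print(resolve_custom_config_file())  # "search_config.yaml" unless overridden
```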
1 change: 1 addition & 0 deletions docs-website/sidebars.js
@@ -272,6 +272,7 @@ module.exports = {
     },
     {
       "Managed DataHub Release History": [
+        "docs/managed-datahub/release-notes/v_0_3_3",
         "docs/managed-datahub/release-notes/v_0_3_2",
         "docs/managed-datahub/release-notes/v_0_3_1",
         "docs/managed-datahub/release-notes/v_0_2_16",
2 changes: 1 addition & 1 deletion docs/deploy/environment-vars.md
@@ -60,7 +60,7 @@ DataHub works.
 | `ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED` | `true` | boolean | [`GMS`] | When using structured query, also include exact matches. |
 | `ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR` | 0.5 | float | [`GMS`] | Multiply by this number when partial token match on URN. |
 | `ELASTICSEARCH_QUERY_PARTIAL_FACTOR` | 0.4 | float | [`GMS`] | Multiply by this number when partial token match on non-URN field. |
-| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `false` | boolean | [`GMS`] | Enable search query and ranking customization configuration. |
+| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `true` | boolean | [`GMS`] | Enable search query and ranking customization configuration. |
 | `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE` | `search_config.yml` | string | [`GMS`] | The location of the search customization configuration. |
 | `ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX` | `false` | boolean | [`System Update`] | Enable reindexing on Elasticsearch schema changes. |
 | `ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE` | `false` | boolean | [`System Update`] | Enable reindexing to remove hard deleted structured properties. |
23 changes: 23 additions & 0 deletions docs/managed-datahub/release-notes/v_0_3_3.md
@@ -0,0 +1,23 @@
+# v0.3.3
+---
+
+Release Availability Date
+---
+25-June-2024
+
+Recommended CLI/SDK
+---
+- `v0.13.3` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.13.3
+
+If you are using an older CLI/SDK version, please upgrade. This applies to every way the CLI/SDK is used: in your terminal, GitHub Actions, Airflow, the Python SDK, the Java SDK, etc. We strongly recommend upgrading, as we continually ship fixes in the CLI, and staying current helps us support you better.
+
+## Release Changelog
+---
+
+- Ability to run assertions "on-demand" from the Acryl UI (reach out to Acryl to enable this feature)
+- Richer Incident Slack Messages - resolve and reopen incidents from Slack, see the latest incident status reflected in the same Slack message, and get richer incident detail when an incident is raised via an assertion
+- Assertion-Level Subscriptions - subscribe to be notified when a specific assertion passes, fails, or errors out
+- Misc fixes in the 2.0 UI (new loading indicators, fixed text overflows, consistent entity health badges)
+- Structured Property Schema Change & Delete
+- Since `v0.3.2`, the following changes from OSS DataHub have been pulled in: https://github.com/datahub-project/datahub/compare/6ed21bd1bc70a3ceb7dddb43ea7db4ca56874547...92e9a5823bc14e81f0f21c55a68c493c3bbe87b9

105 changes: 12 additions & 93 deletions metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
@@ -327,45 +327,6 @@ def operational_data_for_time_window(
         ORDER BY query_start_time DESC
         ;"""

-    @staticmethod
-    def table_to_table_lineage_history(
-        start_time_millis: int,
-        end_time_millis: int,
-        include_column_lineage: bool = True,
-    ) -> str:
-        return f"""
-        WITH table_lineage_history AS (
-            SELECT
-                r.value:"objectName"::varchar AS upstream_table_name,
-                r.value:"objectDomain"::varchar AS upstream_table_domain,
-                r.value:"columns" AS upstream_table_columns,
-                w.value:"objectName"::varchar AS downstream_table_name,
-                w.value:"objectDomain"::varchar AS downstream_table_domain,
-                w.value:"columns" AS downstream_table_columns,
-                t.query_start_time AS query_start_time
-            FROM
-                (SELECT * from snowflake.account_usage.access_history) t,
-                lateral flatten(input => t.DIRECT_OBJECTS_ACCESSED) r,
-                lateral flatten(input => t.OBJECTS_MODIFIED) w
-            WHERE r.value:"objectId" IS NOT NULL
-                AND w.value:"objectId" IS NOT NULL
-                AND w.value:"objectName" NOT LIKE '%.GE_TMP_%'
-                AND w.value:"objectName" NOT LIKE '%.GE_TEMP_%'
-                AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
-                AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3))
-        SELECT
-            upstream_table_name AS "UPSTREAM_TABLE_NAME",
-            downstream_table_name AS "DOWNSTREAM_TABLE_NAME",
-            upstream_table_columns AS "UPSTREAM_TABLE_COLUMNS",
-            downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS"
-        FROM table_lineage_history
-        WHERE upstream_table_domain in ('Table', 'External table') and downstream_table_domain = 'Table'
-        QUALIFY ROW_NUMBER() OVER (
-            PARTITION BY downstream_table_name,
-            upstream_table_name{", downstream_table_columns" if include_column_lineage else ""}
-            ORDER BY query_start_time DESC
-        ) = 1"""
-
     @staticmethod
     def view_dependencies() -> str:
         return """
@@ -386,58 +347,6 @@ def view_dependencies() -> str:
             referencing_object_domain in ('VIEW', 'MATERIALIZED VIEW')
         """

-    @staticmethod
-    def view_lineage_history(
-        start_time_millis: int,
-        end_time_millis: int,
-        include_column_lineage: bool = True,
-    ) -> str:
-        return f"""
-        WITH view_lineage_history AS (
-            SELECT
-                vu.value : "objectName"::varchar AS view_name,
-                vu.value : "objectDomain"::varchar AS view_domain,
-                vu.value : "columns" AS view_columns,
-                w.value : "objectName"::varchar AS downstream_table_name,
-                w.value : "objectDomain"::varchar AS downstream_table_domain,
-                w.value : "columns" AS downstream_table_columns,
-                t.query_start_time AS query_start_time
-            FROM
-                (
-                    SELECT
-                        *
-                    FROM
-                        snowflake.account_usage.access_history
-                ) t,
-                lateral flatten(input => t.DIRECT_OBJECTS_ACCESSED) vu,
-                lateral flatten(input => t.OBJECTS_MODIFIED) w
-            WHERE
-                vu.value : "objectId" IS NOT NULL
-                AND w.value : "objectId" IS NOT NULL
-                AND w.value : "objectName" NOT LIKE '%.GE_TMP_%'
-                AND w.value : "objectName" NOT LIKE '%.GE_TEMP_%'
-                AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
-                AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-            )
-        SELECT
-            view_name AS "VIEW_NAME",
-            view_domain AS "VIEW_DOMAIN",
-            view_columns AS "VIEW_COLUMNS",
-            downstream_table_name AS "DOWNSTREAM_TABLE_NAME",
-            downstream_table_domain AS "DOWNSTREAM_TABLE_DOMAIN",
-            downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS"
-        FROM
-            view_lineage_history
-        WHERE
-            view_domain in ('View', 'Materialized view')
-        QUALIFY ROW_NUMBER() OVER (
-            PARTITION BY view_name,
-            downstream_table_name {", downstream_table_columns" if include_column_lineage else ""}
-            ORDER BY
-                query_start_time DESC
-        ) = 1
-        """
-
     # Note on use of `upstreams_deny_pattern` to ignore temporary tables:
     # Snowflake access history may include temporary tables in DIRECT_OBJECTS_ACCESSED and
     # OBJECTS_MODIFIED->columns->directSources. We do not need these temporary tables and filter these in the query.
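To make the note above concrete: temporary tables (e.g. Great Expectations' `GE_TMP_*` scratch tables) show up in access history and must be excluded from lineage. The real queries do this filtering in SQL via `upstreams_deny_pattern`; the following is only a sketch of the same deny-pattern idea in plain Python, with illustrative patterns and a hypothetical helper, not the actual DataHub implementation:

```python
import re
from typing import List

# Illustrative deny patterns for scratch/temporary tables.
TEMP_TABLE_DENY_PATTERNS: List[str] = [
    r".*\.GE_TMP_.*",
    r".*\.GE_TEMP_.*",
]

def is_denied(table_name: str, deny_patterns: List[str]) -> bool:
    """True if the table matches any deny pattern and should be dropped from lineage."""
    return any(re.fullmatch(p, table_name, re.IGNORECASE) for p in deny_patterns)

upstreams = ["DB.SCHEMA.ORDERS", "DB.SCHEMA.GE_TMP_PROFILE_123"]
print([t for t in upstreams if not is_denied(t, TEMP_TABLE_DENY_PATTERNS)])
# ['DB.SCHEMA.ORDERS']
```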
@@ -773,7 +682,12 @@ def table_upstreams_with_column_lineage(
                 t.query_start_time AS query_start_time,
                 t.query_id AS query_id
             FROM
-                (SELECT * from snowflake.account_usage.access_history) t,
+                (
+                    SELECT * from snowflake.account_usage.access_history
+                    WHERE
+                        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
+                        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
+                ) t,
                 lateral flatten(input => t.DIRECT_OBJECTS_ACCESSED) r,
                 lateral flatten(input => t.OBJECTS_MODIFIED) w,
                 lateral flatten(input => w.value : "columns", outer => true) wcols,
@@ -933,7 +847,12 @@ def table_upstreams_only(
                 t.query_start_time AS query_start_time,
                 t.query_id AS query_id
             FROM
-                (SELECT * from snowflake.account_usage.access_history) t,
+                (
+                    SELECT * from snowflake.account_usage.access_history
+                    WHERE
+                        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
+                        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
+                ) t,
                 lateral flatten(input => t.DIRECT_OBJECTS_ACCESSED) r,
                 lateral flatten(input => t.OBJECTS_MODIFIED) w
             WHERE
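Both hunks above make the same change: the time-window predicate moves into the subquery over `snowflake.account_usage.access_history`, so rows are restricted to the ingestion window before the `lateral flatten(...)` joins fan them out, rather than after. A small standalone sketch of how such a windowed subquery fragment is rendered (an illustration, not the actual `SnowflakeQuery` class):

```python
def access_history_subquery(start_time_millis: int, end_time_millis: int) -> str:
    # Apply the window *inside* the subquery so filtering happens before
    # the lateral flatten(...) joins multiply the row count.
    # to_timestamp_ltz(<value>, 3) interprets the value as epoch milliseconds.
    return f"""(
    SELECT * from snowflake.account_usage.access_history
    WHERE
        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
) t"""

# Example with the window used by the integration-test mocks in common.py:
print(access_history_subquery(1654473600000, 1654586220000))
```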
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/utilities/prefix_batch_builder.py
@@ -35,7 +35,7 @@ def split_group(group: PrefixGroup) -> List[PrefixGroup]:
     prefix_length = len(group.prefix) + 1
     subgroups = defaultdict(list)
     for name in group.names:
-        if len(name) <= prefix_length:
+        if len(name) < prefix_length:
             # Handle cases where a single name is also the prefix for a large number of names.
             # For example, if NAME and NAME_{1..10000} are both in the list.
             result.append(PrefixGroup(prefix=name, names=[name], exact_match=True))
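With `prefix_length = len(group.prefix) + 1`, the corrected `len(name) < prefix_length` test fires only when `name` is the group's prefix itself; the old `<=` also swallowed names exactly one character longer than the prefix. A self-contained sketch of the bucketing step (a simplified stand-in for `split_group`, of which the diff shows only a fragment):

```python
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class PrefixGroup:
    prefix: str
    names: List[str]
    exact_match: bool = False

def split_group_sketch(prefix: str, names: List[str]) -> List[PrefixGroup]:
    """Split one oversized group into an exact-match group plus longer-prefix buckets."""
    prefix_length = len(prefix) + 1
    result: List[PrefixGroup] = []
    subgroups: Dict[str, List[str]] = defaultdict(list)
    for name in names:
        if len(name) < prefix_length:  # only the prefix itself qualifies
            result.append(PrefixGroup(prefix=name, names=[name], exact_match=True))
        else:
            subgroups[name[:prefix_length]].append(name)
    result.extend(PrefixGroup(prefix=p, names=ns) for p, ns in subgroups.items())
    return result

print(split_group_sketch("app", ["app", "apple", "applet", "application"]))
# [PrefixGroup(prefix='app', names=['app'], exact_match=True),
#  PrefixGroup(prefix='appl', names=['apple', 'applet', 'application'], exact_match=False)]
```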
103 changes: 0 additions & 103 deletions metadata-ingestion/tests/integration/snowflake/common.py
@@ -433,69 +433,6 @@ def default_query_results(  # noqa: C901
             for i in range(num_usages)
         ]
         return mock
-    elif query in (
-        snowflake_query.SnowflakeQuery.table_to_table_lineage_history(
-            1654473600000,
-            1654586220000,
-        ),
-        snowflake_query.SnowflakeQuery.table_to_table_lineage_history(
-            1654473600000, 1654586220000, False
-        ),
-    ):
-        return [
-            {
-                "DOWNSTREAM_TABLE_NAME": f"TEST_DB.TEST_SCHEMA.TABLE_{op_idx}",
-                "UPSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_2",
-                "UPSTREAM_TABLE_COLUMNS": json.dumps(
-                    [
-                        {"columnId": 0, "columnName": f"COL_{col_idx}"}
-                        for col_idx in range(1, num_cols + 1)
-                    ]
-                ),
-                "DOWNSTREAM_TABLE_COLUMNS": json.dumps(
-                    [
-                        {
-                            "columnId": 0,
-                            "columnName": f"COL_{col_idx}",
-                            "directSources": [
-                                {
-                                    "columnName": f"COL_{col_idx}",
-                                    "objectDomain": "Table",
-                                    "objectId": 0,
-                                    "objectName": "TEST_DB.TEST_SCHEMA.TABLE_2",
-                                }
-                            ],
-                        }
-                        for col_idx in range(1, num_cols + 1)
-                    ]
-                ),
-            }
-            for op_idx in range(1, num_ops + 1)
-        ] + [
-            {
-                "DOWNSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_1",
-                "UPSTREAM_TABLE_NAME": "OTHER_DB.OTHER_SCHEMA.TABLE_1",
-                "UPSTREAM_TABLE_COLUMNS": json.dumps(
-                    [{"columnId": 0, "columnName": "COL_1"}]
-                ),
-                "DOWNSTREAM_TABLE_COLUMNS": json.dumps(
-                    [
-                        {
-                            "columnId": 0,
-                            "columnName": "COL_1",
-                            "directSources": [
-                                {
-                                    "columnName": "COL_1",
-                                    "objectDomain": "Table",
-                                    "objectId": 0,
-                                    "objectName": "OTHER_DB.OTHER_SCHEMA.TABLE_1",
-                                }
-                            ],
-                        }
-                    ]
-                ),
-            }
-        ]
     elif query in (
         snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2(
             start_time_millis=1654473600000,
@@ -662,46 +599,6 @@ def default_query_results(  # noqa: C901
                 ),
             }
         ]
-    elif query in [
-        snowflake_query.SnowflakeQuery.view_lineage_history(
-            1654473600000,
-            1654586220000,
-        ),
-        snowflake_query.SnowflakeQuery.view_lineage_history(
-            1654473600000, 1654586220000, False
-        ),
-    ]:
-        return [
-            {
-                "DOWNSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_1",
-                "VIEW_NAME": "TEST_DB.TEST_SCHEMA.VIEW_1",
-                "VIEW_DOMAIN": "VIEW",
-                "VIEW_COLUMNS": json.dumps(
-                    [
-                        {"columnId": 0, "columnName": f"COL_{col_idx}"}
-                        for col_idx in range(1, num_cols + 1)
-                    ]
-                ),
-                "DOWNSTREAM_TABLE_DOMAIN": "TABLE",
-                "DOWNSTREAM_TABLE_COLUMNS": json.dumps(
-                    [
-                        {
-                            "columnId": 0,
-                            "columnName": f"COL_{col_idx}",
-                            "directSources": [
-                                {
-                                    "columnName": f"COL_{col_idx}",
-                                    "objectDomain": "Table",
-                                    "objectId": 0,
-                                    "objectName": "TEST_DB.TEST_SCHEMA.TABLE_2",
-                                }
-                            ],
-                        }
-                        for col_idx in range(1, num_cols + 1)
-                    ]
-                ),
-            }
-        ]
     elif query in [
         snowflake_query.SnowflakeQuery.view_dependencies_v2(),
         snowflake_query.SnowflakeQuery.view_dependencies(),
2 changes: 1 addition & 1 deletion metadata-ingestion/tests/unit/utilities/test_prefix_batch_builder.py
@@ -35,7 +35,7 @@ def test_build_prefix_batches_exceeds_max_batch_size():
     ]
     expected = [
         [PrefixGroup(prefix="app", names=["app"], exact_match=True)],
-        [PrefixGroup(prefix="app", names=["apple", "applet", "application"])],
+        [PrefixGroup(prefix="appl", names=["apple", "applet", "application"])],
         [PrefixGroup(prefix="b", names=["banana", "band", "bandana"])],
         [
             PrefixGroup(prefix="c", names=["candle", "candy"]),