
Merge branch 'datahub-project:master' into master
anshbansal authored Jun 26, 2024
2 parents 3b32ec6 + 5df6468 commit 425b9cb
Showing 16 changed files with 335 additions and 327 deletions.
16 changes: 7 additions & 9 deletions datahub-web-react/src/app/entity/dataset/DatasetEntity.tsx
@@ -119,11 +119,11 @@ export class DatasetEntity implements Entity<Dataset> {
                 component: ViewDefinitionTab,
                 display: {
                     visible: (_, dataset: GetDatasetQuery) =>
-                        dataset?.dataset?.subTypes?.typeNames
+                        !!dataset?.dataset?.viewProperties?.logic ||
+                        !!dataset?.dataset?.subTypes?.typeNames
                             ?.map((t) => t.toLocaleLowerCase())
-                            .includes(SUBTYPES.VIEW.toLocaleLowerCase()) || false,
-                    enabled: (_, dataset: GetDatasetQuery) =>
-                        (dataset?.dataset?.viewProperties?.logic && true) || false,
+                            .includes(SUBTYPES.VIEW.toLocaleLowerCase()),
+                    enabled: (_, dataset: GetDatasetQuery) => !!dataset?.dataset?.viewProperties?.logic,
                 },
             },
             {
@@ -178,8 +178,7 @@ export class DatasetEntity implements Entity<Dataset> {
                 },
             },
             {
-                name: 'Runs',
-                // TODO: Rename this to DatasetRunsTab.
+                name: 'Runs', // TODO: Rename this to DatasetRunsTab.
                 component: OperationsTab,
                 display: {
                     visible: (_, dataset: GetDatasetQuery) => {
@@ -234,7 +233,7 @@ export class DatasetEntity implements Entity<Dataset> {
         {
             component: SidebarViewDefinitionSection,
             display: {
-                visible: (_, dataset: GetDatasetQuery) => (dataset?.dataset?.viewProperties?.logic && true) || false,
+                visible: (_, dataset: GetDatasetQuery) => !!dataset?.dataset?.viewProperties?.logic,
             },
         },
         {
@@ -249,8 +248,7 @@ export class DatasetEntity implements Entity<Dataset> {
         },
         {
             component: DataProductSection,
-        },
-        // TODO: Add back once entity-level recommendations are complete.
+        }, // TODO: Add back once entity-level recommendations are complete.
         // {
         //     component: SidebarRecommendationsSection,
         // },
8 changes: 6 additions & 2 deletions docker/profiles/docker-compose.gms.yml
@@ -99,8 +99,7 @@ x-datahub-gms-service: &datahub-gms-service
     - ${DATAHUB_LOCAL_GMS_ENV:-empty2.env}
   environment: &datahub-gms-env
     <<: [*primary-datastore-mysql-env, *graph-datastore-search-env, *search-datastore-env, *datahub-quickstart-telemetry-env, *kafka-env]
-    ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED: true
-    ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: '/etc/datahub/search/search_config.yaml'
+    ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-search_config.yaml}
   healthcheck:
     test: curl -sS --fail http://datahub-gms:${DATAHUB_GMS_PORT:-8080}/health
     start_period: 90s
@@ -119,8 +118,13 @@ x-datahub-gms-service-dev: &datahub-gms-service-dev
   ports:
     - ${DATAHUB_MAPPED_GMS_DEBUG_PORT:-5001}:5001
     - ${DATAHUB_MAPPED_GMS_PORT:-8080}:8080
+  env_file:
+    - datahub-gms/env/docker.env
+    - ${DATAHUB_LOCAL_COMMON_ENV:-empty.env}
+    - ${DATAHUB_LOCAL_GMS_ENV:-empty2.env}
   environment: &datahub-gms-dev-env
     <<: [*datahub-dev-telemetry-env, *datahub-gms-env]
+    ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE: ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-/etc/datahub/search/search_config.yaml}
     SKIP_ELASTICSEARCH_CHECK: false
     JAVA_TOOL_OPTIONS: '-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=*:5001'
     BOOTSTRAP_SYSTEM_UPDATE_WAIT_FOR_SYSTEM_UPDATE: false
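A note on the `${VAR:-default}` interpolation used above: Compose substitutes the default whenever the variable is unset or empty in the launching environment, which is how `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE` now gets a per-profile default while remaining overridable. A minimal Python sketch of the same resolution rule (illustrative only, not DataHub code):

```python
import os

def resolve_custom_config_file(default: str = "search_config.yaml") -> str:
    # Mirrors compose's ${ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE:-default}:
    # an unset *or* empty variable falls back to the default value.
    value = os.environ.get("ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE")
    return value if value else default

print(resolve_custom_config_file())  # "search_config.yaml" unless overridden
```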
1 change: 1 addition & 0 deletions docs-website/sidebars.js
@@ -272,6 +272,7 @@ module.exports = {
     },
     {
       "Managed DataHub Release History": [
+        "docs/managed-datahub/release-notes/v_0_3_3",
         "docs/managed-datahub/release-notes/v_0_3_2",
         "docs/managed-datahub/release-notes/v_0_3_1",
         "docs/managed-datahub/release-notes/v_0_2_16",
2 changes: 1 addition & 1 deletion docs/deploy/environment-vars.md
@@ -60,7 +60,7 @@ DataHub works.
 | `ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED` | `true` | boolean | [`GMS`] | When using structured query, also include exact matches. |
 | `ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR` | 0.5 | float | [`GMS`] | Multiply by this number when partial token match on URN. |
 | `ELASTICSEARCH_QUERY_PARTIAL_FACTOR` | 0.4 | float | [`GMS`] | Multiply by this number when partial token match on non-URN field. |
-| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `false` | boolean | [`GMS`] | Enable search query and ranking customization configuration. |
+| `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_ENABLED` | `true` | boolean | [`GMS`] | Enable search query and ranking customization configuration. |
 | `ELASTICSEARCH_QUERY_CUSTOM_CONFIG_FILE` | `search_config.yml` | string | [`GMS`] | The location of the search customization configuration. |
 | `ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX` | `false` | boolean | [`System Update`] | Enable reindexing on Elasticsearch schema changes. |
 | `ENABLE_STRUCTURED_PROPERTIES_SYSTEM_UPDATE` | `false` | boolean | [`System Update`] | Enable reindexing to remove hard deleted structured properties. |
23 changes: 23 additions & 0 deletions docs/managed-datahub/release-notes/v_0_3_3.md
@@ -0,0 +1,23 @@
+# v0.3.3
+---
+
+Release Availability Date
+---
+25-June-2024
+
+Recommended CLI/SDK
+---
+- `v0.13.3` with release notes at https://github.com/acryldata/datahub/releases/tag/v0.13.3
+
+If you are using an older CLI/SDK version, please upgrade. This applies to every way the CLI/SDK is used: in your terminal, GitHub Actions, Airflow, the Python SDK, the Java SDK, etc. We strongly recommend upgrading, as we continually ship fixes in the CLI, and staying current helps us support you better.
+
+## Release Changelog
+---
+
+- Ability to run assertions "on-demand" from the Acryl UI (reach out to Acryl to enable this feature)
+- Richer Incident Slack Messages - resolve and reopen incidents from Slack, see the latest incident status reflected in the same Slack message, and get richer incident detail when an incident is raised via an assertion
+- Assertion-Level Subscriptions - subscribe to be notified when a specific assertion passes, fails, or errors out
+- Misc fixes in the 2.0 UI (new loading indicators, fixed text overflows, consistent entity health badges)
+- Structured Property Schema Change & Delete
+- Since `v0.3.2`, the following changes from OSS DataHub have been pulled in: https://github.com/datahub-project/datahub/compare/6ed21bd1bc70a3ceb7dddb43ea7db4ca56874547...92e9a5823bc14e81f0f21c55a68c493c3bbe87b9

105 changes: 12 additions & 93 deletions metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py
@@ -327,45 +327,6 @@ def operational_data_for_time_window(
         ORDER BY query_start_time DESC
         ;"""

-    @staticmethod
-    def table_to_table_lineage_history(
-        start_time_millis: int,
-        end_time_millis: int,
-        include_column_lineage: bool = True,
-    ) -> str:
-        return f"""
-        WITH table_lineage_history AS (
-            SELECT
-                r.value:"objectName"::varchar AS upstream_table_name,
-                r.value:"objectDomain"::varchar AS upstream_table_domain,
-                r.value:"columns" AS upstream_table_columns,
-                w.value:"objectName"::varchar AS downstream_table_name,
-                w.value:"objectDomain"::varchar AS downstream_table_domain,
-                w.value:"columns" AS downstream_table_columns,
-                t.query_start_time AS query_start_time
-            FROM
-                (SELECT * from snowflake.account_usage.access_history) t,
-                lateral flatten(input => t.DIRECT_OBJECTS_ACCESSED) r,
-                lateral flatten(input => t.OBJECTS_MODIFIED) w
-            WHERE r.value:"objectId" IS NOT NULL
-                AND w.value:"objectId" IS NOT NULL
-                AND w.value:"objectName" NOT LIKE '%.GE_TMP_%'
-                AND w.value:"objectName" NOT LIKE '%.GE_TEMP_%'
-                AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
-                AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3))
-        SELECT
-            upstream_table_name AS "UPSTREAM_TABLE_NAME",
-            downstream_table_name AS "DOWNSTREAM_TABLE_NAME",
-            upstream_table_columns AS "UPSTREAM_TABLE_COLUMNS",
-            downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS"
-        FROM table_lineage_history
-        WHERE upstream_table_domain in ('Table', 'External table') and downstream_table_domain = 'Table'
-        QUALIFY ROW_NUMBER() OVER (
-            PARTITION BY downstream_table_name,
-            upstream_table_name{", downstream_table_columns" if include_column_lineage else ""}
-            ORDER BY query_start_time DESC
-        ) = 1"""
-
     @staticmethod
     def view_dependencies() -> str:
         return """
@@ -386,58 +347,6 @@ def view_dependencies() -> str:
             referencing_object_domain in ('VIEW', 'MATERIALIZED VIEW')
         """

-    @staticmethod
-    def view_lineage_history(
-        start_time_millis: int,
-        end_time_millis: int,
-        include_column_lineage: bool = True,
-    ) -> str:
-        return f"""
-        WITH view_lineage_history AS (
-            SELECT
-                vu.value : "objectName"::varchar AS view_name,
-                vu.value : "objectDomain"::varchar AS view_domain,
-                vu.value : "columns" AS view_columns,
-                w.value : "objectName"::varchar AS downstream_table_name,
-                w.value : "objectDomain"::varchar AS downstream_table_domain,
-                w.value : "columns" AS downstream_table_columns,
-                t.query_start_time AS query_start_time
-            FROM
-                (
-                    SELECT
-                        *
-                    FROM
-                        snowflake.account_usage.access_history
-                ) t,
-                lateral flatten(input => t.DIRECT_OBJECTS_ACCESSED) vu,
-                lateral flatten(input => t.OBJECTS_MODIFIED) w
-            WHERE
-                vu.value : "objectId" IS NOT NULL
-                AND w.value : "objectId" IS NOT NULL
-                AND w.value : "objectName" NOT LIKE '%.GE_TMP_%'
-                AND w.value : "objectName" NOT LIKE '%.GE_TEMP_%'
-                AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
-                AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)
-            )
-        SELECT
-            view_name AS "VIEW_NAME",
-            view_domain AS "VIEW_DOMAIN",
-            view_columns AS "VIEW_COLUMNS",
-            downstream_table_name AS "DOWNSTREAM_TABLE_NAME",
-            downstream_table_domain AS "DOWNSTREAM_TABLE_DOMAIN",
-            downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS"
-        FROM
-            view_lineage_history
-        WHERE
-            view_domain in ('View', 'Materialized view')
-        QUALIFY ROW_NUMBER() OVER (
-            PARTITION BY view_name,
-            downstream_table_name {", downstream_table_columns" if include_column_lineage else ""}
-            ORDER BY
-                query_start_time DESC
-        ) = 1
-        """
-
     # Note on use of `upstreams_deny_pattern` to ignore temporary tables:
     # Snowflake access history may include temporary tables in DIRECT_OBJECTS_ACCESSED and
     # OBJECTS_MODIFIED->columns->directSources. We do not need these temporary tables and filter these in the query.
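To make the note above concrete: temporary tables (e.g. Great Expectations' `GE_TMP_*` scratch tables) show up in access history and must be excluded from lineage. The real queries do this filtering in SQL via `upstreams_deny_pattern`; the following is only a sketch of the same deny-pattern idea in plain Python, with illustrative patterns and a hypothetical helper, not the actual DataHub implementation:

```python
import re
from typing import List

# Illustrative deny patterns for scratch/temporary tables.
TEMP_TABLE_DENY_PATTERNS: List[str] = [
    r".*\.GE_TMP_.*",
    r".*\.GE_TEMP_.*",
]

def is_denied(table_name: str, deny_patterns: List[str]) -> bool:
    """True if the table matches any deny pattern and should be dropped from lineage."""
    return any(re.fullmatch(p, table_name, re.IGNORECASE) for p in deny_patterns)

upstreams = ["DB.SCHEMA.ORDERS", "DB.SCHEMA.GE_TMP_PROFILE_123"]
print([t for t in upstreams if not is_denied(t, TEMP_TABLE_DENY_PATTERNS)])
# ['DB.SCHEMA.ORDERS']
```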
@@ -773,7 +682,12 @@ def table_upstreams_with_column_lineage(
                 t.query_start_time AS query_start_time,
                 t.query_id AS query_id
             FROM
-                (SELECT * from snowflake.account_usage.access_history) t,
+                (
+                    SELECT * from snowflake.account_usage.access_history
+                    WHERE
+                        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
+                        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
+                ) t,
                 lateral flatten(input => t.DIRECT_OBJECTS_ACCESSED) r,
                 lateral flatten(input => t.OBJECTS_MODIFIED) w,
                 lateral flatten(input => w.value : "columns", outer => true) wcols,
@@ -933,7 +847,12 @@ def table_upstreams_only(
                 t.query_start_time AS query_start_time,
                 t.query_id AS query_id
             FROM
-                (SELECT * from snowflake.account_usage.access_history) t,
+                (
+                    SELECT * from snowflake.account_usage.access_history
+                    WHERE
+                        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
+                        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
+                ) t,
                 lateral flatten(input => t.DIRECT_OBJECTS_ACCESSED) r,
                 lateral flatten(input => t.OBJECTS_MODIFIED) w
             WHERE
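Both hunks above make the same change: the time-window predicate moves into the subquery over `snowflake.account_usage.access_history`, so rows are restricted to the ingestion window before the `lateral flatten(...)` joins fan them out, rather than after. A small standalone sketch of how such a windowed subquery fragment is rendered (an illustration, not the actual `SnowflakeQuery` class):

```python
def access_history_subquery(start_time_millis: int, end_time_millis: int) -> str:
    # Apply the window *inside* the subquery so filtering happens before
    # the lateral flatten(...) joins multiply the row count.
    # to_timestamp_ltz(<value>, 3) interprets the value as epoch milliseconds.
    return f"""(
    SELECT * from snowflake.account_usage.access_history
    WHERE
        query_start_time >= to_timestamp_ltz({start_time_millis}, 3)
        AND query_start_time < to_timestamp_ltz({end_time_millis}, 3)
) t"""

# Example with the window used by the integration-test mocks in common.py:
print(access_history_subquery(1654473600000, 1654586220000))
```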
2 changes: 1 addition & 1 deletion metadata-ingestion/src/datahub/utilities/prefix_batch_builder.py
@@ -35,7 +35,7 @@ def split_group(group: PrefixGroup) -> List[PrefixGroup]:
     prefix_length = len(group.prefix) + 1
     subgroups = defaultdict(list)
     for name in group.names:
-        if len(name) <= prefix_length:
+        if len(name) < prefix_length:
             # Handle cases where a single name is also the prefix for a large number of names.
             # For example, if NAME and NAME_{1..10000} are both in the list.
             result.append(PrefixGroup(prefix=name, names=[name], exact_match=True))
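With `prefix_length = len(group.prefix) + 1`, the corrected `len(name) < prefix_length` test fires only when `name` is the group's prefix itself; the old `<=` also swallowed names exactly one character longer than the prefix. A self-contained sketch of the bucketing step (a simplified stand-in for `split_group`, of which the diff shows only a fragment):

```python
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, List

@dataclass
class PrefixGroup:
    prefix: str
    names: List[str]
    exact_match: bool = False

def split_group_sketch(prefix: str, names: List[str]) -> List[PrefixGroup]:
    """Split one oversized group into an exact-match group plus longer-prefix buckets."""
    prefix_length = len(prefix) + 1
    result: List[PrefixGroup] = []
    subgroups: Dict[str, List[str]] = defaultdict(list)
    for name in names:
        if len(name) < prefix_length:  # only the prefix itself qualifies
            result.append(PrefixGroup(prefix=name, names=[name], exact_match=True))
        else:
            subgroups[name[:prefix_length]].append(name)
    result.extend(PrefixGroup(prefix=p, names=ns) for p, ns in subgroups.items())
    return result

print(split_group_sketch("app", ["app", "apple", "applet", "application"]))
# [PrefixGroup(prefix='app', names=['app'], exact_match=True),
#  PrefixGroup(prefix='appl', names=['apple', 'applet', 'application'], exact_match=False)]
```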
103 changes: 0 additions & 103 deletions metadata-ingestion/tests/integration/snowflake/common.py
@@ -433,69 +433,6 @@ def default_query_results(  # noqa: C901
             for i in range(num_usages)
         ]
         return mock
-    elif query in (
-        snowflake_query.SnowflakeQuery.table_to_table_lineage_history(
-            1654473600000,
-            1654586220000,
-        ),
-        snowflake_query.SnowflakeQuery.table_to_table_lineage_history(
-            1654473600000, 1654586220000, False
-        ),
-    ):
-        return [
-            {
-                "DOWNSTREAM_TABLE_NAME": f"TEST_DB.TEST_SCHEMA.TABLE_{op_idx}",
-                "UPSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_2",
-                "UPSTREAM_TABLE_COLUMNS": json.dumps(
-                    [
-                        {"columnId": 0, "columnName": f"COL_{col_idx}"}
-                        for col_idx in range(1, num_cols + 1)
-                    ]
-                ),
-                "DOWNSTREAM_TABLE_COLUMNS": json.dumps(
-                    [
-                        {
-                            "columnId": 0,
-                            "columnName": f"COL_{col_idx}",
-                            "directSources": [
-                                {
-                                    "columnName": f"COL_{col_idx}",
-                                    "objectDomain": "Table",
-                                    "objectId": 0,
-                                    "objectName": "TEST_DB.TEST_SCHEMA.TABLE_2",
-                                }
-                            ],
-                        }
-                        for col_idx in range(1, num_cols + 1)
-                    ]
-                ),
-            }
-            for op_idx in range(1, num_ops + 1)
-        ] + [
-            {
-                "DOWNSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_1",
-                "UPSTREAM_TABLE_NAME": "OTHER_DB.OTHER_SCHEMA.TABLE_1",
-                "UPSTREAM_TABLE_COLUMNS": json.dumps(
-                    [{"columnId": 0, "columnName": "COL_1"}]
-                ),
-                "DOWNSTREAM_TABLE_COLUMNS": json.dumps(
-                    [
-                        {
-                            "columnId": 0,
-                            "columnName": "COL_1",
-                            "directSources": [
-                                {
-                                    "columnName": "COL_1",
-                                    "objectDomain": "Table",
-                                    "objectId": 0,
-                                    "objectName": "OTHER_DB.OTHER_SCHEMA.TABLE_1",
-                                }
-                            ],
-                        }
-                    ]
-                ),
-            }
-        ]
     elif query in (
         snowflake_query.SnowflakeQuery.table_to_table_lineage_history_v2(
             start_time_millis=1654473600000,
@@ -662,46 +599,6 @@ def default_query_results(  # noqa: C901
                 ),
             }
         ]
-    elif query in [
-        snowflake_query.SnowflakeQuery.view_lineage_history(
-            1654473600000,
-            1654586220000,
-        ),
-        snowflake_query.SnowflakeQuery.view_lineage_history(
-            1654473600000, 1654586220000, False
-        ),
-    ]:
-        return [
-            {
-                "DOWNSTREAM_TABLE_NAME": "TEST_DB.TEST_SCHEMA.TABLE_1",
-                "VIEW_NAME": "TEST_DB.TEST_SCHEMA.VIEW_1",
-                "VIEW_DOMAIN": "VIEW",
-                "VIEW_COLUMNS": json.dumps(
-                    [
-                        {"columnId": 0, "columnName": f"COL_{col_idx}"}
-                        for col_idx in range(1, num_cols + 1)
-                    ]
-                ),
-                "DOWNSTREAM_TABLE_DOMAIN": "TABLE",
-                "DOWNSTREAM_TABLE_COLUMNS": json.dumps(
-                    [
-                        {
-                            "columnId": 0,
-                            "columnName": f"COL_{col_idx}",
-                            "directSources": [
-                                {
-                                    "columnName": f"COL_{col_idx}",
-                                    "objectDomain": "Table",
-                                    "objectId": 0,
-                                    "objectName": "TEST_DB.TEST_SCHEMA.TABLE_2",
-                                }
-                            ],
-                        }
-                        for col_idx in range(1, num_cols + 1)
-                    ]
-                ),
-            }
-        ]
     elif query in [
         snowflake_query.SnowflakeQuery.view_dependencies_v2(),
         snowflake_query.SnowflakeQuery.view_dependencies(),
2 changes: 1 addition & 1 deletion metadata-ingestion/tests/unit/utilities/test_prefix_batch_builder.py
@@ -35,7 +35,7 @@ def test_build_prefix_batches_exceeds_max_batch_size():
     ]
     expected = [
         [PrefixGroup(prefix="app", names=["app"], exact_match=True)],
-        [PrefixGroup(prefix="app", names=["apple", "applet", "application"])],
+        [PrefixGroup(prefix="appl", names=["apple", "applet", "application"])],
         [PrefixGroup(prefix="b", names=["banana", "band", "bandana"])],
         [
             PrefixGroup(prefix="c", names=["candle", "candy"]),