From a92c6b2bb0768b33ba479b012d51ef20ae2194f2 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 28 Nov 2024 21:42:40 -0500 Subject: [PATCH] feat(ingest): add tests for colon characters in urns (#11976) --- .../src/datahub/utilities/urn_encoder.py | 3 +- .../tests/unit/urns/test_urn.py | 49 ++++++++++++++++--- 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/urn_encoder.py b/metadata-ingestion/src/datahub/utilities/urn_encoder.py index 88c0a128b8e468..4f19eeff3e70f0 100644 --- a/metadata-ingestion/src/datahub/utilities/urn_encoder.py +++ b/metadata-ingestion/src/datahub/utilities/urn_encoder.py @@ -4,7 +4,8 @@ # NOTE: Frontend relies on encoding these three characters. Specifically, we decode and encode schema fields for column level lineage. # If this changes, make appropriate changes to datahub-web-react/src/app/lineage/utils/columnLineageUtils.ts # We also rely on encoding these exact three characters when generating schemaField urns in our graphQL layer. Update SchemaFieldUtils if this changes. -RESERVED_CHARS = {",", "(", ")"} +# Also see https://datahubproject.io/docs/what/urn/#restrictions +RESERVED_CHARS = {",", "(", ")", "␟"} RESERVED_CHARS_EXTENDED = RESERVED_CHARS.union({"%"}) diff --git a/metadata-ingestion/tests/unit/urns/test_urn.py b/metadata-ingestion/tests/unit/urns/test_urn.py index 1bf48082fec8c9..73badb3d1b4234 100644 --- a/metadata-ingestion/tests/unit/urns/test_urn.py +++ b/metadata-ingestion/tests/unit/urns/test_urn.py @@ -1,6 +1,12 @@ import pytest -from datahub.metadata.urns import DatasetUrn, Urn +from datahub.metadata.urns import ( + CorpUserUrn, + DashboardUrn, + DataPlatformUrn, + DatasetUrn, + Urn, +) from datahub.utilities.urns.error import InvalidUrnError pytestmark = pytest.mark.filterwarnings("ignore::DeprecationWarning") @@ -36,20 +42,51 @@ def test_url_encode_urn() -> None: def test_invalid_urn() -> None: with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc") + Urn.from_string("urn:li:abc") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:") + Urn.from_string("urn:li:abc:") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:()") + Urn.from_string("urn:li:abc:()") with pytest.raises(InvalidUrnError): - Urn.create_from_string("urn:li:abc:(abc,)") + Urn.from_string("urn:li:abc:(abc,)") + + with pytest.raises(InvalidUrnError): + Urn.from_string("urn:li:corpuser:abc)") + + +def test_urn_colon() -> None: + # Colon characters are valid in urns, and should not mess up parsing. + + urn = Urn.from_string( + "urn:li:dashboard:(looker,dashboards.thelook::customer_lookup)" + ) + assert isinstance(urn, DashboardUrn) + + assert DataPlatformUrn.from_string("urn:li:dataPlatform:abc:def") + assert DatasetUrn.from_string( + "urn:li:dataset:(urn:li:dataPlatform:abc:def,table_name,PROD)" + ) + assert Urn.from_string("urn:li:corpuser:foo:bar@example.com") + + # I'm not sure why you'd ever want this, but technically it's a valid urn. + urn = Urn.from_string("urn:li:corpuser::") + assert isinstance(urn, CorpUserUrn) + assert urn.username == ":" + assert urn == CorpUserUrn(":") + + +def test_urn_coercion() -> None: + urn = CorpUserUrn("foo␟bar") + assert urn.urn() == "urn:li:corpuser:foo%E2%90%9Fbar" + + assert urn == Urn.from_string(urn.urn()) def test_urn_type_dispatch() -> None: - urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,prod)") + urn = Urn.from_string("urn:li:dataset:(urn:li:dataPlatform:abc,def,PROD)") assert isinstance(urn, DatasetUrn) with pytest.raises(InvalidUrnError, match="Passed an urn of type corpuser"):