Skip to content

Commit

Permalink
fix(ingest): consistent fingerprint for sql parsing aggregator (#12239)
Browse files Browse the repository at this point in the history
  • Loading branch information
mayurinehate authored Jan 3, 2025
1 parent f9e2c49 commit fba0996
Show file tree
Hide file tree
Showing 10 changed files with 96 additions and 64 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
ColumnRef,
DownstreamColumnRef,
)
from datahub.sql_parsing.sqlglot_utils import get_query_fingerprint
from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedList
from datahub.utilities.perf_timer import PerfTimer

Expand Down Expand Up @@ -475,10 +476,11 @@ def _parse_audit_log_row(

entry = PreparsedQuery(
# Despite having Snowflake's fingerprints available, our own fingerprinting logic does a better
# job at eliminating redundant / repetitive queries. As such, we don't include the fingerprint
# here so that the aggregator auto-generates one.
# query_id=res["query_fingerprint"],
query_id=None,
# job at eliminating redundant / repetitive queries. As such, we include the fast fingerprint
# here
query_id=get_query_fingerprint(
res["query_text"], self.identifiers.platform, fast=True
),
query_text=res["query_text"],
upstreams=upstreams,
downstream=downstream,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,20 @@ def default_user_urn_builder(email: str) -> str:
return builder.make_user_urn(email.split("@")[0])


def extract_user_email(user: str) -> Optional[str]:
"""Extracts user email from user input
>>> extract_user_email('urn:li:corpuser:[email protected]')
'[email protected]'
>>> extract_user_email('urn:li:corpuser:abc')
>>> extract_user_email('[email protected]')
'[email protected]'
"""
if user.startswith(("urn:li:corpuser:", "urn:li:corpGroup:")):
user = user.split(":")[-1]
return user if "@" in user else None


def make_usage_workunit(
bucket_start_time: datetime,
resource: ResourceType,
Expand Down Expand Up @@ -104,7 +118,7 @@ def make_usage_workunit(
DatasetUserUsageCountsClass(
user=user_urn_builder(user),
count=count,
userEmail=user if "@" in user else None,
userEmail=extract_user_email(user),
)
for user, count in user_freq
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def id(self) -> str:

@dataclasses.dataclass
class PreparsedQuery:
# If not provided, we will generate one using the fast fingerprint generator.
# If not provided, we will generate one using the fingerprint generator.
query_id: Optional[QueryId]

query_text: str
Expand Down Expand Up @@ -622,7 +622,6 @@ def add_known_query_lineage(
query_fingerprint = get_query_fingerprint(
known_query_lineage.query_text,
platform=self.platform.platform_name,
fast=True,
)
formatted_query = self._maybe_format_query(known_query_lineage.query_text)

Expand Down Expand Up @@ -848,7 +847,6 @@ def add_preparsed_query(
query_fingerprint = get_query_fingerprint(
parsed.query_text,
platform=self.platform.platform_name,
fast=True,
)

# Format the query.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.bar,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
}
],
"fineGrainedLineages": [
Expand All @@ -32,7 +32,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -44,7 +44,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -56,15 +56,15 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
}
]
}
}
},
{
"entityType": "query",
"entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f",
"entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -87,7 +87,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f",
"entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -114,7 +114,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f",
"entityUrn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand All @@ -137,7 +137,7 @@
},
"operationType": "INSERT",
"customProperties": {
"query_urn": "urn:li:query:6ed1d12fbf2ccc8138ceec08cc35b981030d6d004bfad9743c7afd84260fa63f"
"query_urn": "urn:li:query:02e2ec36678bea2a8c4c855fed5255d087cfeb2710d326e95fd9b48a9c4fc0ae"
},
"lastUpdatedTimestamp": 20000
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_staging,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
"query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
}
],
"fineGrainedLineages": [
Expand All @@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
"query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -159,7 +159,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
"query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -171,15 +171,15 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),c)"
],
"confidenceScore": 1.0,
"query": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4"
"query": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b"
}
]
}
}
},
{
"entityType": "query",
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -202,7 +202,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -229,7 +229,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:88d742bcc0216d6ccb50c7430d1d97494d5dfcfa90160ffa123108844ad261e4",
"entityUrn": "urn:li:query:07a307ad99d3c4a7e54d20c004a4f2d52496f3f5283b33013f80e6323700d97b",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@
},
"dataset": "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo_dep,PROD)",
"type": "TRANSFORMED",
"query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332"
"query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e"
}
],
"fineGrainedLineages": [
Expand All @@ -147,7 +147,7 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),a)"
],
"confidenceScore": 0.2,
"query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332"
"query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e"
},
{
"upstreamType": "FIELD_SET",
Expand All @@ -159,15 +159,15 @@
"urn:li:schemaField:(urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.foo,PROD),b)"
],
"confidenceScore": 0.2,
"query": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332"
"query": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e"
}
]
}
}
},
{
"entityType": "query",
"entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332",
"entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e",
"changeType": "UPSERT",
"aspectName": "queryProperties",
"aspect": {
Expand All @@ -190,7 +190,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332",
"entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e",
"changeType": "UPSERT",
"aspectName": "querySubjects",
"aspect": {
Expand All @@ -217,7 +217,7 @@
},
{
"entityType": "query",
"entityUrn": "urn:li:query:composite_2efc2a13ee673ccf7b195f8f2c0e4ba0570194d8200c3c20b1eb7e8ca4fb4332",
"entityUrn": "urn:li:query:composite_c035c933cc4ce5cf8a111bcaf419b8e66a1e41853bb154ff9aaa24cd00ecf51e",
"changeType": "UPSERT",
"aspectName": "dataPlatformInstance",
"aspect": {
Expand Down
Loading

0 comments on commit fba0996

Please sign in to comment.