Skip to content

Commit

Permalink
Merge branch 'datahub-project:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
sgomezvillamor authored Jan 3, 2025
2 parents 0c32306 + 1190dd9 commit 6fcce4d
Show file tree
Hide file tree
Showing 54 changed files with 4,485 additions and 1,105 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/airflow-plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,8 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./build/coverage-reports/
fail_ci_if_error: false
flags: airflow,airflow-${{ matrix.extra_pip_extras }}
name: pytest-airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_requirements }}
flags: airflow-${{ matrix.python-version }}-${{ matrix.extra_pip_extras }}
name: pytest-airflow
verbose: true

event-file:
Expand Down
9 changes: 3 additions & 6 deletions .github/workflows/metadata-ingestion.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,6 @@ jobs:
"testIntegrationBatch1",
"testIntegrationBatch2",
]
include:
- python-version: "3.8"
- python-version: "3.11"
fail-fast: false
steps:
- name: Free up disk space
Expand Down Expand Up @@ -92,14 +89,14 @@ jobs:
**/junit.*.xml
!**/binary/**
- name: Upload coverage to Codecov
if: ${{ always() && matrix.python-version == '3.10' }}
if: ${{ always() }}
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./build/coverage-reports/
fail_ci_if_error: false
flags: pytest-${{ matrix.command }}
name: pytest-${{ matrix.python-version }}-${{ matrix.command }}
flags: ingestion-${{ matrix.python-version }}-${{ matrix.command }}
name: pytest-ingestion
verbose: true

event-file:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/prefect-plugin.yml
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ jobs:
token: ${{ secrets.CODECOV_TOKEN }}
directory: ./build/coverage-reports/
fail_ci_if_error: false
flags: prefect,prefect-${{ matrix.python-version }}
name: pytest-prefect-${{ matrix.python-version }}
flags: prefect-${{ matrix.python-version }}
name: pytest-prefect
verbose: true

event-file:
Expand Down
11 changes: 9 additions & 2 deletions datahub-frontend/app/auth/AuthModule.java
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,12 @@ protected OperationContext provideOperationContext(
final Authentication systemAuthentication,
final ConfigurationProvider configurationProvider) {
ActorContext systemActorContext =
ActorContext.builder().systemAuth(true).authentication(systemAuthentication).build();
ActorContext.builder()
.systemAuth(true)
.authentication(systemAuthentication)
.enforceExistenceEnabled(
configurationProvider.getAuthentication().isEnforceExistenceEnabled())
.build();
OperationContextConfig systemConfig =
OperationContextConfig.builder()
.viewAuthorizationConfiguration(configurationProvider.getAuthorization().getView())
Expand All @@ -197,7 +202,9 @@ protected OperationContext provideOperationContext(
.entityRegistryContext(EntityRegistryContext.builder().build(EmptyEntityRegistry.EMPTY))
.validationContext(ValidationContext.builder().alternateValidation(false).build())
.retrieverContext(RetrieverContext.EMPTY)
.build(systemAuthentication);
.build(
systemAuthentication,
configurationProvider.getAuthentication().isEnforceExistenceEnabled());
}

@Provides
Expand Down
4 changes: 4 additions & 0 deletions datahub-frontend/app/config/ConfigurationProvider.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package config;

import com.datahub.authentication.AuthenticationConfiguration;
import com.datahub.authorization.AuthorizationConfiguration;
import com.linkedin.metadata.config.VisualConfiguration;
import com.linkedin.metadata.config.cache.CacheConfiguration;
Expand Down Expand Up @@ -30,4 +31,7 @@ public class ConfigurationProvider {

/** Configuration for authorization */
private AuthorizationConfiguration authorization;

/** Configuration for authentication */
private AuthenticationConfiguration authentication;
}
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,8 @@ protected OperationContext javaSystemOperationContext(
ValidationContext.builder()
.alternateValidation(
configurationProvider.getFeatureFlags().isAlternateMCPValidation())
.build());
.build(),
true);

entityServiceAspectRetriever.setSystemOperationContext(systemOperationContext);
systemGraphRetriever.setSystemOperationContext(systemOperationContext);
Expand Down
30 changes: 30 additions & 0 deletions docs/authentication/guides/add-users.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';

# Onboarding Users to DataHub

New user accounts can be provisioned on DataHub in 3 ways:
Expand Down Expand Up @@ -94,6 +97,11 @@ using this mechanism. It is highly recommended that admins change or remove the

## Adding new users using a user.props file

:::note
Adding users via the `user.props` file will require disabling existence checks on GMS using the `METADATA_SERVICE_AUTH_ENFORCE_EXISTENCE_ENABLED=false` environment variable, or using the API to enable the user prior to login.
The directions below demonstrate using the API to enable the user.
:::

To define a set of username / password combinations that should be allowed to log in to DataHub (in addition to the root 'datahub' user),
create a new file called `user.props` at the file path `${HOME}/.datahub/plugins/frontend/auth/user.props` within the `datahub-frontend-react` container
or pod.
Expand All @@ -107,6 +115,28 @@ janesmith:janespassword
johndoe:johnspassword
```

To enable access for a user with the credentials defined in `user.props`, set the `status` aspect on that user as an Admin user. This can be done using an API call or via the [OpenAPI UI interface](/docs/api/openapi/openapi-usage-guide.md).

<Tabs>
<TabItem value="openapi" label="OpenAPI" default>

Example enabling login for the `janesmith` user from the example above. Make sure to update the example with your access token.

```shell
curl -X 'POST' \
'http://localhost:9002/openapi/v3/entity/corpuser/urn%3Ali%3Acorpuser%3Ajanesmith/status?async=false&systemMetadata=false&createIfEntityNotExists=false&createIfNotExists=true' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer <access token>' \
-d '{
"value": {
"removed": false
}
}'
```
</TabItem>
</Tabs>

Once you've saved the file, simply start the DataHub containers & navigate to `http://localhost:9002/login`
to verify that your new credentials work.

Expand Down
15 changes: 15 additions & 0 deletions docs/how/delete-metadata.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,21 @@ The start and end time fields filter on the `timestampMillis` field of the times
- `ddddddddd` (e.g. `1684384045`): a unix timestamp
- `min`, `max`, `now`: special keywords

#### Undoing soft deletion of entities

You can restore soft-deleted entities using the `undo-by-filter` command. This reverts the effect of a soft delete.

```shell
# Restore (un-soft-delete) a single soft-deleted entity
datahub delete undo-by-filter --urn "urn:li:dataset:(urn:li:dataPlatform:hive,fct_users_deleted,PROD)"

# Restore all soft-deleted entities from a specific platform
datahub delete undo-by-filter --platform snowflake

# You can adjust the batch size (default 3000, max 10000) for better performance
datahub delete undo-by-filter --platform snowflake --batch-size 5000
```

## Delete CLI Examples

:::note
Expand Down
2 changes: 2 additions & 0 deletions docs/how/updating-datahub.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe

## Next
- #12191 - Configs `include_view_lineage` and `include_view_column_lineage` are removed from snowflake ingestion source. View and External Table DDL lineage will always be ingested when definitions are available.
- #12181 - Configs `include_view_lineage`, `include_view_column_lineage` and `lineage_parse_view_ddl` are removed from bigquery ingestion source. View and Snapshot lineage will always be ingested when definitions are available.
- #11560 - The PowerBI ingestion source configuration option `include_workspace_name_in_dataset_urn` determines whether the workspace name is included in the PowerBI dataset's URN.<br/> PowerBI allows semantic models and their tables to have identical names across workspaces; in a multi-workspace ingestion, this can cause one semantic model to overwrite another.<br/>
Entity urn with `include_workspace_name_in_dataset_urn: false`

Expand Down Expand Up @@ -65,6 +66,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe
changed to NOT fill out `created` and `lastModified` auditstamps by default
for input and output dataset edges. This should not have any user-observable
impact (time-based lineage viz will still continue working based on observed time), but could break assumptions previously being made by clients.
- #12158 - Users provisioned with `user.props` will need to be enabled before login in order to be granted access to DataHub.

### Potential Downtime

Expand Down
15 changes: 9 additions & 6 deletions metadata-ingestion/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@ task lintFix(type: Exec, dependsOn: installDev) {
"mypy --show-traceback --show-error-codes src/ tests/ examples/"
}

def pytest_default_env = "PYTHONDEVMODE=1"
def pytest_default_args = "--durations=30 -vv --continue-on-collection-errors"

task testQuick(type: Exec, dependsOn: [installDev, ':metadata-models:generateJsonSchema']) {
// We can't enforce the coverage requirements if we run a subset of the tests.
inputs.files(project.fileTree(dir: "src/", include: "**/*.py"))
Expand All @@ -135,7 +138,7 @@ task testQuick(type: Exec, dependsOn: [installDev, ':metadata-models:generateJso
def cvg_arg = get_coverage_args("quick")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"pytest ${cvg_arg} tests/unit --random-order --durations=20 -m 'not integration' -vv --continue-on-collection-errors --junit-xml=junit.quick.xml"
"${pytest_default_env} pytest ${cvg_arg} tests/unit ${pytest_default_args} --random-order -m 'not integration' --junit-xml=junit.quick.xml"
}

task installDevTest(type: Exec, dependsOn: [install]) {
Expand All @@ -155,7 +158,7 @@ task testSingle(dependsOn: [installDevTest]) {
if (testFile != 'unknown') {
exec {
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && pytest ${testFile}"
"source ${venv_name}/bin/activate && ${pytest_default_env} pytest ${testFile} ${pytest_default_args}"
}
} else {
throw new GradleException("No file provided. Use -PtestFile=<test_file>")
Expand All @@ -167,25 +170,25 @@ task testIntegrationBatch0(type: Exec, dependsOn: [installDevTest]) {
def cvg_arg = get_coverage_args("intBatch0")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"pytest ${cvg_arg} --durations=50 -m 'integration_batch_0' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch0.xml"
"${pytest_default_env} pytest ${cvg_arg} ${pytest_default_args} -m 'integration_batch_0' --junit-xml=junit.integrationbatch0.xml"
}
task testIntegrationBatch1(type: Exec, dependsOn: [installDevTest]) {
def cvg_arg = get_coverage_args("intBatch1")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"pytest ${cvg_arg} --durations=50 -m 'integration_batch_1' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch1.xml"
"${pytest_default_env} pytest ${cvg_arg} ${pytest_default_args} -m 'integration_batch_1' --junit-xml=junit.integrationbatch1.xml"
}
task testIntegrationBatch2(type: Exec, dependsOn: [installDevTest]) {
def cvg_arg = get_coverage_args("intBatch2")
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"pytest ${cvg_arg} --durations=20 -m 'integration_batch_2' -vv --continue-on-collection-errors --junit-xml=junit.integrationbatch2.xml"
"${pytest_default_env} pytest ${cvg_arg} ${pytest_default_args} -m 'integration_batch_2' --junit-xml=junit.integrationbatch2.xml"
}

task testFull(type: Exec, dependsOn: [installDevTest]) {
commandLine 'bash', '-c',
"source ${venv_name}/bin/activate && set -x && " +
"pytest --durations=50 -vv --continue-on-collection-errors --junit-xml=junit.full.xml"
"${pytest_default_env} pytest ${pytest_default_args} --junit-xml=junit.full.xml"
}

task specGen(type: Exec, dependsOn: [codegen, installDevTest]) {
Expand Down
6 changes: 3 additions & 3 deletions metadata-ingestion/examples/mce_files/bootstrap_mce.json
Original file line number Diff line number Diff line change
Expand Up @@ -3394,7 +3394,7 @@
"changeType":"UPSERT",
"aspectName":"datasetProfile",
"aspect":{
"value":"{\"timestampMillis\": 1723488954865, \"rowCount\": 4500, \"columnCount\": 2, \"sizeInBytes\": 842000200000, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"false\"]}]}",
"value":"{\"timestampMillis\": 1735823280000, \"rowCount\": 4500, \"columnCount\": 2, \"sizeInBytes\": 842000200000, \"fieldProfiles\": [{\"fieldPath\": \"field_foo\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"true\", \"false\"]}, {\"fieldPath\": \"field_bar\", \"uniqueCount\": 2, \"uniqueProportion\": 0.00044, \"nullCount\": 0, \"nullProportion\": 0.0, \"sampleValues\": [\"false\"]}]}",
"contentType":"application/json"
},
"systemMetadata":null
Expand All @@ -3418,7 +3418,7 @@
"changeType":"UPSERT",
"aspectName":"operation",
"aspect":{
"value":"{\"timestampMillis\": 1679515693000, \"operationType\": \"INSERT\", \"lastUpdatedTimestamp\": 1629097200001 }",
"value":"{\"timestampMillis\": 1711138093000, \"operationType\": \"INSERT\", \"lastUpdatedTimestamp\": 1629097200001 }",
"contentType":"application/json"
},
"systemMetadata":null
Expand Down Expand Up @@ -3584,7 +3584,7 @@
"changeType": "UPSERT",
"aspectName": "assertionRunEvent",
"aspect": {
"value": "{\"timestampMillis\": 1675155843000, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"{\\\"category\\\": \\\"catA\\\"}\"}, \"runId\": \"2021-12-28T12:00:00Z\", \"assertionUrn\": \"urn:li:assertion:358c683782c93c2fc2bd4bdd4fdb0153\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)\", \"batchSpec\": {\"customProperties\": {\"data_asset_name\": \"data__foo1__asset\", \"datasource_name\": \"my_hive_datasource\"}, \"nativeBatchId\": \"c8f12129f2e57412eee5fb8656154d05\", \"limit\": 10}, \"status\": \"COMPLETE\", \"result\": {\"type\": \"SUCCESS\", \"nativeResults\": {}}}",
"value": "{\"timestampMillis\": 1730554659000, \"partitionSpec\": {\"type\": \"PARTITION\", \"partition\": \"{\\\"category\\\": \\\"catA\\\"}\"}, \"runId\": \"2021-12-28T12:00:00Z\", \"assertionUrn\": \"urn:li:assertion:358c683782c93c2fc2bd4bdd4fdb0153\", \"asserteeUrn\": \"urn:li:dataset:(urn:li:dataPlatform:hive,SampleHiveDataset,PROD)\", \"batchSpec\": {\"customProperties\": {\"data_asset_name\": \"data__foo1__asset\", \"datasource_name\": \"my_hive_datasource\"}, \"nativeBatchId\": \"c8f12129f2e57412eee5fb8656154d05\", \"limit\": 10}, \"status\": \"COMPLETE\", \"result\": {\"type\": \"SUCCESS\", \"nativeResults\": {}}}",
"contentType": "application/json"
},
"systemMetadata": null
Expand Down
6 changes: 3 additions & 3 deletions metadata-ingestion/src/datahub/emitter/mce_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

import typing_inspect
from avrogen.dict_wrapper import DictWrapper
from typing_extensions import assert_never

from datahub.emitter.enum_helpers import get_enum_options
from datahub.metadata.schema_classes import (
Expand Down Expand Up @@ -269,9 +270,8 @@ def make_owner_urn(owner: str, owner_type: OwnerType) -> str:
return make_user_urn(owner)
elif owner_type == OwnerType.GROUP:
return make_group_urn(owner)
# This should pretty much never happen.
# TODO: With Python 3.11, we can use typing.assert_never() here.
return f"urn:li:{owner_type.value}:{owner}"
else:
assert_never(owner_type)


def make_ownership_type_urn(type: str) -> str:
Expand Down
48 changes: 36 additions & 12 deletions metadata-ingestion/src/datahub/emitter/mcp_patch_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,19 @@
import time
from collections import defaultdict
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Sequence, Union
from typing import (
Any,
Dict,
List,
Literal,
Optional,
Protocol,
Tuple,
Union,
runtime_checkable,
)

from typing_extensions import LiteralString

from datahub.emitter.aspect import JSON_PATCH_CONTENT_TYPE
from datahub.emitter.serialization_helper import pre_json_transform
Expand All @@ -19,25 +31,36 @@
from datahub.utilities.urns.urn import guess_entity_type


@runtime_checkable
class SupportsToObj(Protocol):
def to_obj(self) -> Any:
...


def _recursive_to_obj(obj: Any) -> Any:
if isinstance(obj, list):
return [_recursive_to_obj(v) for v in obj]
elif hasattr(obj, "to_obj"):
elif isinstance(obj, SupportsToObj):
return obj.to_obj()
else:
return obj


PatchPath = Tuple[Union[LiteralString, Urn], ...]
PatchOp = Literal["add", "remove", "replace"]


@dataclass
class _Patch:
op: str # one of ['add', 'remove', 'replace']; we don't support move, copy or test
path: str
class _Patch(SupportsToObj):
op: PatchOp
path: PatchPath
value: Any

def to_obj(self) -> Dict:
quoted_path = "/" + "/".join(MetadataPatchProposal.quote(p) for p in self.path)
return {
"op": self.op,
"path": self.path,
"path": quoted_path,
"value": _recursive_to_obj(self.value),
}

Expand All @@ -63,15 +86,16 @@ def __init__(

# Json Patch quoting based on https://jsonpatch.com/#json-pointer
@classmethod
def quote(cls, value: str) -> str:
return value.replace("~", "~0").replace("/", "~1")
def quote(cls, value: Union[str, Urn]) -> str:
return str(value).replace("~", "~0").replace("/", "~1")

def _add_patch(
self, aspect_name: str, op: str, path: Union[str, Sequence[str]], value: Any
self,
aspect_name: str,
op: PatchOp,
path: PatchPath,
value: Any,
) -> None:
if not isinstance(path, str):
path = "/" + "/".join(self.quote(p) for p in path)

# TODO: Validate that aspectName is a valid aspect for this entityType
self.patches[aspect_name].append(_Patch(op, path, value))

Expand Down
Loading

0 comments on commit 6fcce4d

Please sign in to comment.