
Commit

Merge branch 'master' into feature/add-exclusive-project-path-pattern
haeniya authored Sep 17, 2024
2 parents 59300c2 + adc6b90 commit 3b91a2d
Showing 24 changed files with 269 additions and 109 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/docker-unified.yml
@@ -990,7 +990,7 @@ jobs:
DATAHUB_VERSION: ${{ needs.setup.outputs.unique_tag }}
DATAHUB_ACTIONS_IMAGE: ${{ env.DATAHUB_INGESTION_IMAGE }}
ACTIONS_VERSION: ${{ needs.datahub_ingestion_slim_build.outputs.tag || 'head-slim' }}
ACTIONS_EXTRA_PACKAGES: "acryl-datahub-actions[executor]==0.0.13 acryl-datahub-actions==0.0.13 acryl-datahub==0.10.5"
ACTIONS_EXTRA_PACKAGES: "acryl-datahub-actions[executor] acryl-datahub-actions"
ACTIONS_CONFIG: "https://raw.githubusercontent.com/acryldata/datahub-actions/main/docker/config/executor.yaml"
run: |
./smoke-test/run-quickstart.sh
11 changes: 8 additions & 3 deletions build.gradle
@@ -52,7 +52,7 @@ buildscript {
ext.hadoop3Version = '3.3.6'
ext.kafkaVersion = '5.5.15'
ext.hazelcastVersion = '5.3.6'
ext.ebeanVersion = '12.16.1'
ext.ebeanVersion = '15.5.2'
ext.googleJavaFormatVersion = '1.18.1'
ext.openLineageVersion = '1.19.0'
ext.logbackClassicJava8 = '1.2.12'
@@ -104,8 +104,8 @@ project.ext.spec = [

project.ext.externalDependency = [
'akkaHttp': 'com.typesafe.akka:akka-http-core_2.12:10.2.10',
'antlr4Runtime': 'org.antlr:antlr4-runtime:4.7.2',
'antlr4': 'org.antlr:antlr4:4.7.2',
'antlr4Runtime': 'org.antlr:antlr4-runtime:4.9.3',
'antlr4': 'org.antlr:antlr4:4.9.3',
'assertJ': 'org.assertj:assertj-core:3.11.1',
'avro': 'org.apache.avro:avro:1.11.3',
'avroCompiler': 'org.apache.avro:avro-compiler:1.11.3',
@@ -129,8 +129,10 @@ project.ext.externalDependency = [
'dropwizardMetricsCore': 'io.dropwizard.metrics:metrics-core:4.2.3',
'dropwizardMetricsJmx': 'io.dropwizard.metrics:metrics-jmx:4.2.3',
'ebean': 'io.ebean:ebean:' + ebeanVersion,
'ebeanTest': 'io.ebean:ebean-test:' + ebeanVersion,
'ebeanAgent': 'io.ebean:ebean-agent:' + ebeanVersion,
'ebeanDdl': 'io.ebean:ebean-ddl-generator:' + ebeanVersion,
'ebeanQueryBean': 'io.ebean:querybean-generator:' + ebeanVersion,
'elasticSearchRest': 'org.opensearch.client:opensearch-rest-high-level-client:' + elasticsearchVersion,
'elasticSearchJava': 'org.opensearch.client:opensearch-java:2.6.0',
'findbugsAnnotations': 'com.google.code.findbugs:annotations:3.0.1',
@@ -359,6 +361,9 @@ configure(subprojects.findAll {! it.name.startsWith('spark-lineage')}) {
exclude group: "org.slf4j", module: "slf4j-log4j12"
exclude group: "org.slf4j", module: "slf4j-nop"
exclude group: "org.slf4j", module: "slf4j-ext"

resolutionStrategy.force externalDependency.antlr4Runtime
resolutionStrategy.force externalDependency.antlr4
}
}

@@ -77,7 +77,7 @@ public Function<UpgradeContext, UpgradeStepResult> executable() {
}

try {
_server.execute(_server.createSqlUpdate(sqlUpdateStr));
_server.execute(_server.sqlUpdate(sqlUpdateStr));
} catch (Exception e) {
context.report().addLine("Failed to create table metadata_aspect_v2", e);
return new DefaultUpgradeStepResult(id(), DataHubUpgradeState.FAILED);
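The one-line change above follows from the Ebean 12 → 15 bump in `build.gradle`: `createSqlUpdate(...)` is no longer available, and raw SQL statements are now built with `sqlUpdate(...)`. A minimal sketch of the new pattern, assuming Ebean 15's `Database` API; the class name and SQL string are illustrative only, not DataHub's actual DDL:

```java
// Minimal sketch, assuming Ebean 15; the table and SQL are illustrative only.
import io.ebean.DB;
import io.ebean.Database;
import io.ebean.SqlUpdate;

public class SqlUpdateExample {
  public static void main(String[] args) {
    Database database = DB.getDefault();

    // Ebean 12 style (removed in the diff above): database.execute(database.createSqlUpdate(sql));
    // Ebean 15 style: build the statement with sqlUpdate(...) and execute it.
    SqlUpdate createTable =
        database.sqlUpdate("CREATE TABLE IF NOT EXISTS example_aspect (id BIGINT PRIMARY KEY)");
    database.execute(createTable);

    // Equivalent shorthand: database.sqlUpdate("...").execute();
  }
}
```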
2 changes: 2 additions & 0 deletions docker/datahub-ingestion-base/Dockerfile
@@ -43,7 +43,9 @@ RUN apt-get update && apt-get upgrade -y \
krb5-user \
krb5-config \
libkrb5-dev \
librdkafka-dev \
wget \
curl \
zip \
unzip \
ldap-utils \
51 changes: 43 additions & 8 deletions docs/how/restore-indices.md
@@ -7,24 +7,39 @@ When a new version of the aspect gets ingested, GMS initiates an MAE event for t
the search and graph indices. As such, we can fetch the latest version of each aspect in the local database and produce
MAE events corresponding to the aspects to restore the search and graph indices.

By default, restoring the indices from the local database will not remove any existing documents in
the search and graph indices that no longer exist in the local database, potentially leading to inconsistencies
between the search and graph indices and the local database.

## Quickstart

If you're using the quickstart images, you can use the `datahub` cli to restore indices.
If you're using the quickstart images, you can use the `datahub` cli to restore the indices.

```
```shell
datahub docker quickstart --restore-indices
```
See [this section](../quickstart.md#restoring-only-the-index-use-with-care) for more information.

:::info
Using the `datahub` CLI to restore the indices when using the quickstart images will also clear the search and graph indices before restoring.

See [this section](../quickstart.md#restore-datahub) for more information.

## Docker-compose

If you are on a custom docker-compose deployment, run the following command (you need to checkout [the source repository](https://github.com/datahub-project/datahub)) from the root of the repo to send MAE for each aspect in the Local DB.
If you are on a custom docker-compose deployment, run the following command (you need to check out [the source repository](https://github.com/datahub-project/datahub)) from the root of the repo to send MAE for each aspect in the local database.

```
```shell
./docker/datahub-upgrade/datahub-upgrade.sh -u RestoreIndices
```

If you need to clear the search and graph indices before restoring, add `-a clean` to the end of the command.
:::info
By default, this command will not clear the search and graph indices before restoring, thus potentially leading to inconsistencies between the local database and the indices, in case aspects were previously deleted in the local database but were not removed from the corresponding index.

If you need to clear the search and graph indices before restoring, add `-a clean` to the end of the command. Note that the search and graph services might not be fully functional during reindexing when the indices are cleared.

```shell
./docker/datahub-upgrade/datahub-upgrade.sh -u RestoreIndices -a clean
```

Refer to this [doc](../../docker/datahub-upgrade/README.md#environment-variables) on how to set environment variables
for your environment.
@@ -44,11 +59,31 @@ If not, deploy latest helm charts to use this functionality.

Once the restore indices job template has been deployed, run the following command to start a job that restores indices.

```
```shell
kubectl create job --from=cronjob/datahub-datahub-restore-indices-job-template datahub-restore-indices-adhoc
```

Once the job completes, your indices will have been restored.
Once the job completes, your indices will have been restored.

:::info
By default, the restore indices job template will not clear the search and graph indices before restoring, thus potentially leading to inconsistencies between the local database and the indices, in case aspects were previously deleted in the local database but were not removed from the corresponding index.

If you need to clear the search and graph indices before restoring, modify the `values.yaml` for your deployment and overwrite the default arguments of the restore indices job template to include the `-a clean` argument. Note that the search and graph services might not be fully functional during reindexing when the indices are cleared.

```yaml
datahubUpgrade:
restoreIndices:
image:
args:
- "-u"
- "RestoreIndices"
- "-a"
- "batchSize=1000" # default value of datahubUpgrade.batchSize
- "-a"
- "batchDelayMs=100" # default value of datahubUpgrade.batchDelayMs
- "-a"
- "clean"
```
## Through API
20 changes: 4 additions & 16 deletions metadata-ingestion/src/datahub/cli/docker_cli.py
@@ -36,7 +36,6 @@
from datahub.telemetry import telemetry
from datahub.upgrade import upgrade
from datahub.utilities.perf_timer import PerfTimer
from datahub.utilities.sample_data import BOOTSTRAP_MCES_FILE, download_sample_data

logger = logging.getLogger(__name__)
_ClickPositiveInt = click.IntRange(min=1)
@@ -957,11 +956,6 @@ def valid_restore_options(


@docker.command()
@click.option(
"--path",
type=click.Path(exists=True, dir_okay=False),
help=f"The MCE json file to ingest. Defaults to downloading {BOOTSTRAP_MCES_FILE} from GitHub",
)
@click.option(
"--token",
type=str,
@@ -970,13 +964,9 @@ def valid_restore_options(
help="The token to be used when ingesting, used when datahub is deployed with METADATA_SERVICE_AUTH_ENABLED=true",
)
@telemetry.with_telemetry()
def ingest_sample_data(path: Optional[str], token: Optional[str]) -> None:
def ingest_sample_data(token: Optional[str]) -> None:
"""Ingest sample data into a running DataHub instance."""

if path is None:
click.echo("Downloading sample data...")
path = str(download_sample_data())

# Verify that docker is up.
status = check_docker_quickstart()
if not status.is_ok():
@@ -989,10 +979,8 @@ def ingest_sample_data(path: Optional[str], token: Optional[str]) -> None:
click.echo("Starting ingestion...")
recipe: dict = {
"source": {
"type": "file",
"config": {
"path": path,
},
"type": "demo-data",
"config": {},
},
"sink": {
"type": "datahub-rest",
Expand All @@ -1003,7 +991,7 @@ def ingest_sample_data(path: Optional[str], token: Optional[str]) -> None:
if token is not None:
recipe["sink"]["config"]["token"] = token

pipeline = Pipeline.create(recipe, report_to=None)
pipeline = Pipeline.create(recipe)
pipeline.run()
ret = pipeline.pretty_print_summary()
sys.exit(ret)
7 changes: 6 additions & 1 deletion metadata-ingestion/src/datahub/ingestion/source/nifi.py
@@ -638,7 +638,12 @@ def create_nifi_flow(self):
)
nifi_version: Optional[str] = None
if about_response.ok:
nifi_version = about_response.json().get("about", {}).get("version")
try:
nifi_version = about_response.json().get("about", {}).get("version")
except Exception as e:
logger.error(
f"Unable to parse about response from Nifi: {about_response} due to {e}"
)
else:
logger.warning("Failed to fetch version for nifi")
cluster_response = self.session.get(
@@ -125,6 +125,7 @@ def is_dataset_pattern_allowed(
SnowflakeObjectDomain.EXTERNAL_TABLE,
SnowflakeObjectDomain.VIEW,
SnowflakeObjectDomain.MATERIALIZED_VIEW,
SnowflakeObjectDomain.ICEBERG_TABLE,
):
return False
if _is_sys_table(dataset_name):
@@ -42,7 +42,11 @@ for jarFile in ${jarFiles}; do
grep -v "MetadataChangeProposal.avsc" |\
grep -v "io.openlineage" |\
grep -v "org.apache" |\
grep -v "aix"
grep -v "aix" |\
grep -v "scala" |\
grep -v "io/micrometer/" |\
grep -v "library.properties" |\
grep -v "rootdoc.txt" |\
grep -v "com/ibm/.*"


if [ $? -ne 0 ]; then
@@ -41,7 +41,10 @@ jar -tvf $jarFile |\
grep -v "aix" |\
grep -v "com/sun/" |\
grep -v "VersionInfo.java" |\
grep -v "mime.types"
grep -v "mime.types" |\
grep -v "com/ibm/.*" |\
grep -v "org/glassfish/" |\
grep -v "LICENSE"

if [ $? -ne 0 ]; then
echo "✅ No unexpected class paths found in ${jarFile}"
@@ -39,7 +39,8 @@ jar -tvf $jarFile |\
grep -v "library.properties" |\
grep -v "rootdoc.txt" |\
grep -v "VersionInfo.java" |\
grep -v "mime.types"
grep -v "mime.types" |\
grep -v "com/ibm/.*"


if [ $? -ne 0 ]; then
18 changes: 8 additions & 10 deletions metadata-io/build.gradle
@@ -1,6 +1,7 @@
plugins {
id 'java-library'
id 'pegasus'
id 'io.ebean' version "${ebeanVersion}" // Use the latest version from global build.gradle
}

configurations {
@@ -50,8 +51,9 @@ dependencies {
runtimeOnly externalDependency.jna
api externalDependency.kafkaClients
api externalDependency.ebean
enhance externalDependency.ebeanAgent
annotationProcessor externalDependency.ebeanQueryBean
implementation externalDependency.ebeanDdl
implementation externalDependency.ebeanAgent
implementation externalDependency.opentelemetryAnnotations
implementation externalDependency.resilience4j
// Newer Spring libraries require JDK17 classes, allow for JDK11
@@ -89,6 +91,7 @@ dependencies {
testImplementation externalDependency.lombok
testImplementation externalDependency.springBootTest
testImplementation spec.product.pegasus.restliServer
testImplementation externalDependency.ebeanTest

// logback >=1.3 required due to `testcontainers` only
testImplementation 'ch.qos.logback:logback-classic:1.4.7'
@@ -139,17 +142,12 @@ test {
testLogging.exceptionFormat = 'full'
}

tasks.withType(Test) {
enableAssertions = false
ebean {
debugLevel = 1 // 0 - 9
}

project.compileJava {
doLast {
ant.taskdef(name: 'ebean', classname: 'io.ebean.enhance.ant.AntEnhanceTask',
classpath: project.configurations.enhance.asPath)
ant.ebean(classSource: "${project.buildDir}/classes/java/main", packages: 'com.linkedin.metadata.entity.ebean',
transformArgs: 'debug=1')
}
tasks.withType(Test) {
enableAssertions = false
}

clean {
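The `metadata-io` build now relies on the `io.ebean` Gradle plugin (plus the `querybean-generator` annotation processor) instead of the ant-driven `ebean-agent` task, so entity classes are enhanced at compile time. Enhancement rewrites the compiled bytecode so plain beans pick up persistence behavior such as dirty checking and lazy loading without a runtime agent. A rough sketch with a hypothetical entity, not an actual DataHub class:

```java
// Hypothetical entity for illustration; with the io.ebean Gradle plugin the compiled
// class is enhanced at build time, so no runtime agent or ant task is needed.
import io.ebean.DB;
import io.ebean.Model;
import jakarta.persistence.Entity;
import jakarta.persistence.Id;

@Entity
public class ExampleAspect extends Model {
  @Id
  long id;

  String urn;

  public String getUrn() {
    return urn;
  }

  public void setUrn(String urn) {
    this.urn = urn;
  }

  // Example usage: enhancement tracks dirty properties, so only 'urn' is written.
  public static void renameExample() {
    ExampleAspect aspect = DB.find(ExampleAspect.class, 1L);
    if (aspect != null) {
      aspect.setUrn("urn:li:example:1");
      aspect.update();
    }
  }
}
```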
@@ -85,6 +85,7 @@
import com.linkedin.util.Pair;
import io.datahubproject.metadata.context.OperationContext;
import io.opentelemetry.extension.annotations.WithSpan;
import jakarta.persistence.EntityNotFoundException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
Expand All @@ -111,7 +112,6 @@
import java.util.stream.StreamSupport;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import javax.persistence.EntityNotFoundException;
import lombok.Getter;
import lombok.extern.slf4j.Slf4j;

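The import swap above follows from the same Ebean upgrade: Ebean 15 targets Jakarta Persistence, so exception types move from `javax.persistence` to `jakarta.persistence`. A small sketch, reusing the hypothetical `ExampleAspect` from the earlier example; this is hedged illustration, not a description of DataHub's own error handling:

```java
// Sketch only: with Ebean 15, EntityNotFoundException comes from jakarta.persistence.
import io.ebean.DB;
import jakarta.persistence.EntityNotFoundException;

public class JakartaExceptionExample {
  static String loadUrn(long id) {
    try {
      // reference() returns a lazy proxy; touching it for a missing row throws
      // EntityNotFoundException (now the jakarta variant).
      return DB.reference(ExampleAspect.class, id).getUrn();
    } catch (EntityNotFoundException e) {
      return null;
    }
  }
}
```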