Merge branch 'datahub-project:master' into master
anshbansal authored Jun 12, 2024
2 parents b336562 + 1d4977c commit 14890e0
Showing 48 changed files with 1,387 additions and 169 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -126,3 +126,6 @@ metadata-service/war/bin/
metadata-utils/bin/
test-models/bin/

datahub-executor/
datahub-integrations-service/
metadata-ingestion-modules/acryl-cloud
Binary file added datahub-web-react/src/images/db2logo.png
4 changes: 4 additions & 0 deletions docs-website/build.gradle
@@ -148,8 +148,12 @@ clean {
delete 'tmp'
delete 'build'
delete 'just'
delete 'sphinx/venv'
delete 'sphinx/_build'
delete 'versioned_docs'
delete fileTree(dir: 'genDocs', exclude: '.gitignore')
delete fileTree(dir: 'docs', exclude: '.gitignore')
delete fileTree(dir: 'genStatic', exclude: '.gitignore')
delete 'graphql/combined.graphql'
yarnClear
}
4 changes: 2 additions & 2 deletions docs-website/filterTagIndexes.json
@@ -67,7 +67,7 @@
}
},
{
"Path": "docs/generated/ingestion/sources/csv",
"Path": "docs/generated/ingestion/sources/csv-enricher",
"imgPath": "img/datahub-logo-color-mark.svg",
"Title": "CSV",
"Description": "An ingestion source for enriching metadata provided in CSV format provided by DataHub",
@@ -177,7 +177,7 @@
}
},
{
"Path": "docs/generated/ingestion/sources/file",
"Path": "docs/generated/ingestion/sources/metadata-file",
"imgPath": "img/datahub-logo-color-mark.svg",
"Title": "File",
"Description": "An ingestion source for single files provided by DataHub",
16 changes: 8 additions & 8 deletions docs/cli.md
@@ -655,8 +655,8 @@ We use a plugin architecture so that you can install only the dependencies you a
Please see our [Integrations page](https://datahubproject.io/integrations) if you want to filter on the features offered by each source.

| Plugin Name | Install Command | Provides |
| ---------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | --------------------------------------- |
| [file](./generated/ingestion/sources/file.md) | _included by default_ | File source and sink |
|------------------------------------------------------------------------------------------------| ---------------------------------------------------------- | --------------------------------------- |
| [metadata-file](./generated/ingestion/sources/metadata-file.md) | _included by default_ | File source and sink |
| [athena](./generated/ingestion/sources/athena.md) | `pip install 'acryl-datahub[athena]'` | AWS Athena source |
| [bigquery](./generated/ingestion/sources/bigquery.md) | `pip install 'acryl-datahub[bigquery]'` | BigQuery source |
| [datahub-lineage-file](./generated/ingestion/sources/file-based-lineage.md) | _no additional dependencies_ | Lineage File source |
@@ -696,12 +696,12 @@ Please see our [Integrations page](https://datahubproject.io/integrations) if yo

### Sinks

| Plugin Name | Install Command | Provides |
| ----------------------------------------------------------- | -------------------------------------------- | -------------------------- |
| [file](../metadata-ingestion/sink_docs/file.md) | _included by default_ | File source and sink |
| [console](../metadata-ingestion/sink_docs/console.md) | _included by default_ | Console sink |
| [datahub-rest](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API |
| [datahub-kafka](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka |
| Plugin Name | Install Command | Provides |
|-------------------------------------------------------------------| -------------------------------------------- | -------------------------- |
| [metadata-file](../metadata-ingestion/sink_docs/metadata-file.md) | _included by default_ | File source and sink |
| [console](../metadata-ingestion/sink_docs/console.md) | _included by default_ | Console sink |
| [datahub-rest](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-rest]'` | DataHub sink over REST API |
| [datahub-kafka](../metadata-ingestion/sink_docs/datahub.md) | `pip install 'acryl-datahub[datahub-kafka]'` | DataHub sink over Kafka |

These plugins can be mixed and matched as desired. For example:
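A minimal sketch of a recipe that pairs one source plugin with one sink plugin (the plugin choice, bucket path, and server endpoint below are placeholders for illustration, not part of this commit):

```yaml
# Assumes the matching extras are installed, e.g. pip install 'acryl-datahub[s3,datahub-rest]'
source:
  type: s3
  config:
    path_specs:
      - include: "s3://my-bucket/data/*.csv"  # placeholder bucket/prefix
sink:
  type: datahub-rest
  config:
    server: "http://localhost:8080"  # placeholder DataHub REST endpoint
```

Any source from the table above can be swapped in the same way, provided its extra is installed.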

2 changes: 1 addition & 1 deletion docs/how/updating-datahub.md
@@ -484,7 +484,7 @@ Helm with `--atomic`: In general, it is recommended to not use the `--atomic` se

### Breaking Changes

- The `should_overwrite` flag in `csv-enricher` has been replaced with `write_semantics` to match the format used for other sources. See the [documentation](https://datahubproject.io/docs/generated/ingestion/sources/csv/) for more details
- The `should_overwrite` flag in `csv-enricher` has been replaced with `write_semantics` to match the format used for other sources. See the [documentation](https://datahubproject.io/docs/generated/ingestion/sources/csv-enricher/) for more details
- Closing an authorization hole in creating tags adding a Platform Privilege called `Create Tags` for creating tags. This is assigned to `datahub` root user, along
with default All Users policy. Notice: You may need to add this privilege (or `Manage Tags`) to existing users that need the ability to create tags on the platform.
- #5329 Below profiling config parameters are now supported in `BigQuery`:
2 changes: 1 addition & 1 deletion docs/troubleshooting/quickstart.md
@@ -246,7 +246,7 @@ ALTER TABLE metadata_aspect_v2 CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_
## I've modified the default user.props file to include a custom username and password, but I don't see the new user(s) inside the Users & Groups tab. Why not?

Currently, `user.props` is a file used by the JAAS PropertyFileLoginModule solely for the purpose of **Authentication**. The file is not used as a source from which to
ingest additional metadata about the user. For that, you'll need to ingest some custom information about your new user using the Rest.li APIs or the [File-based ingestion source](../generated/ingestion/sources/file.md).
ingest additional metadata about the user. For that, you'll need to ingest some custom information about your new user using the Rest.li APIs or the [Metadata File ingestion source](../generated/ingestion/sources/metadata-file.md).

For an example of a file that ingests user information, check out [single_mce.json](https://github.com/datahub-project/datahub/blob/master/metadata-ingestion/examples/mce_files/single_mce.json), which ingests a single user object into DataHub. Notice that the "urn" field provided
will need to align with the custom username you've provided in user.props file. For example, if your user.props file contains:
2 changes: 1 addition & 1 deletion metadata-ingestion/cli-ingestion.md
@@ -58,7 +58,7 @@ Please refer the following pages for advanced guids on CLI ingestion.
- [Reference for `datahub ingest` command](../docs/cli.md#ingest)
- [UI Ingestion Guide](../docs/ui-ingestion.md)

:::Tip Compatibility
:::tip Compatibility
DataHub server uses a 3 digit versioning scheme, while the CLI uses a 4 digit scheme. For example, if you're using DataHub server version 0.10.0, you should use CLI version 0.10.0.x, where x is a patch version.
We do this because we do CLI releases at a much higher frequency than server releases, usually every few days vs twice a month.

30 changes: 16 additions & 14 deletions metadata-ingestion/docs/sources/s3/README.md
@@ -1,19 +1,11 @@
This connector ingests S3 datasets into DataHub. It allows mapping an individual file or a folder of files to a dataset in DataHub.
This connector ingests AWS S3 datasets into DataHub. It allows mapping an individual file or a folder of files to a dataset in DataHub.
To specify the group of files that form a dataset, use `path_specs` configuration in ingestion recipe. Refer section [Path Specs](https://datahubproject.io/docs/generated/ingestion/sources/s3/#path-specs) for more details.

### Concept Mapping

This ingestion source maps the following Source System Concepts to DataHub Concepts:

| Source Concept | DataHub Concept | Notes |
| ---------------------------------------- |--------------------------------------------------------------------------------------------| ------------------- |
| `"s3"` | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataplatform/) | |
| s3 object / Folder containing s3 objects | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/) | |
| s3 bucket | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Subtype `S3 bucket` |
| s3 folder | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Subtype `Folder` |
:::tip
This connector can also be used to ingest local files.
Just replace `s3://` in your path_specs with an absolute path to files on the machine running ingestion.
:::

This connector supports both local files as well as those stored on AWS S3 (which must be identified using the prefix `s3://`).
### Supported file types
Supported file types are as follows:

@@ -30,6 +22,16 @@ Schemas for schemaless formats (CSV, TSV, JSONL, JSON) are inferred. For CSV, TS
JSON file schemas are inferred on the basis of the entire file (given the difficulty in extracting only the first few objects of the file), which may impact performance.
We are working on using iterator-based JSON parsers to avoid reading in the entire JSON object.

### Concept Mapping

This ingestion source maps the following Source System Concepts to DataHub Concepts:

| Source Concept | DataHub Concept | Notes |
| ---------------------------------------- |--------------------------------------------------------------------------------------------| ------------------- |
| `"s3"` | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataplatform/) | |
| s3 object / Folder containing s3 objects | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/) | |
| s3 bucket | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Subtype `S3 bucket` |
| s3 folder | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/) | Subtype `Folder` |

### Profiling

@@ -42,4 +44,4 @@ This plugin extracts:
- histograms or frequencies of unique values

Note that because the profiling is run with PySpark, we require Spark 3.0.3 with Hadoop 3.2 to be installed (see [compatibility](#compatibility) for more details). If profiling, make sure that permissions for **s3a://** access are set because Spark and Hadoop use the s3a:// protocol to interface with AWS (schema inference outside of profiling requires s3:// access).
Enabling profiling will slow down ingestion runs.
Enabling profiling will slow down ingestion runs.
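
For illustration, a sketch of turning profiling on in an s3 recipe (field names follow the recipe shown below in this commit; the bucket path is a placeholder, and the Spark/Hadoop requirement above only applies when `enabled` is `true`):

```yaml
source:
  type: s3
  config:
    path_specs:
      - include: "s3://my-bucket/data/*.parquet"  # placeholder bucket/prefix
    profiling:
      enabled: true  # requires Spark 3.0.3 + Hadoop 3.2 and s3a:// permissions
```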
11 changes: 8 additions & 3 deletions metadata-ingestion/docs/sources/s3/s3_recipe.yml
@@ -1,9 +1,9 @@
# Ingest data from S3
source:
type: s3
config:
path_specs:
-
include: "s3://covid19-lake/covid_knowledge_graph/csv/nodes/*.*"
- include: "s3://covid19-lake/covid_knowledge_graph/csv/nodes/*.*"

aws_config:
aws_access_key_id: *****
@@ -13,4 +13,9 @@ source:
profiling:
enabled: false

# sink configs
# Ingest data from local filesystem
source:
type: s3
config:
path_specs:
- include: "/absolute/path/*.csv"
4 changes: 3 additions & 1 deletion metadata-ingestion/setup.py
@@ -259,7 +259,9 @@

delta_lake = {
*s3_base,
"deltalake>=0.6.3, != 0.6.4",
# Version 0.18.0 broken on ARM Macs: https://github.com/delta-io/delta-rs/issues/2577
"deltalake>=0.6.3, != 0.6.4, < 0.18.0; platform_system == 'Darwin' and platform_machine == 'arm64'",
"deltalake>=0.6.3, != 0.6.4; platform_system != 'Darwin' or platform_machine != 'arm64'",
}

powerbi_report_server = {"requests", "requests_ntlm"}
@@ -1,4 +1,4 @@
# File
# Metadata File

For context on getting started with ingestion, check out our [metadata ingestion guide](../README.md).

@@ -10,7 +10,7 @@ Works with `acryl-datahub` out of the box.

Outputs metadata to a file. This can be used to decouple metadata sourcing from the
process of pushing it into DataHub, and is particularly useful for debugging purposes.
Note that the [file source](../../docs/generated/ingestion/sources/file.md) can read files generated by this sink.
Note that the [file source](../../docs/generated/ingestion/sources/metadata-file.md) can read files generated by this sink.

## Quickstart recipe
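
The full recipe is collapsed in this diff view; a minimal sketch, assuming the sink type keyword is `file` and using the `filename` field from the config table below (the output path is a placeholder):

```yaml
source:
  # ... any source config ...
sink:
  type: file
  config:
    filename: ./output_mces.json  # placeholder path to write metadata to
```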

@@ -35,4 +35,3 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
| Field | Required | Default | Description |
| -------- | -------- | ------- | ------------------------- |
| filename | ✅ | | Path to file to write to. |

2 changes: 1 addition & 1 deletion metadata-ingestion/sink_overview.md
@@ -25,7 +25,7 @@ When configuring ingestion for DataHub, you're likely to be sending the metadata
For debugging purposes or troubleshooting, the following sinks can be useful:
- [File](sink_docs/file.md)
- [Metadata File](sink_docs/metadata-file.md)
- [Console](sink_docs/console.md)
## Default Sink
@@ -102,7 +102,6 @@ def auto_status_aspect(
"""
all_urns: Set[str] = set()
status_urns: Set[str] = set()
skip_urns: Set[str] = set()
for wu in stream:
urn = wu.get_urn()
all_urns.add(urn)
@@ -127,14 +126,13 @@ def auto_status_aspect(

yield wu

for urn in sorted(all_urns - status_urns - skip_urns):
for urn in sorted(all_urns - status_urns):
entity_type = guess_entity_type(urn)
if not entity_supports_aspect(entity_type, StatusClass):
# If any entity does not support aspect 'status' then skip that entity from adding status aspect.
# Example like dataProcessInstance doesn't suppport status aspect.
# If not skipped gives error: java.lang.RuntimeException: Unknown aspect status for entity dataProcessInstance
continue

yield MetadataChangeProposalWrapper(
entityUrn=urn,
aspect=StatusClass(removed=False),
8 changes: 8 additions & 0 deletions metadata-ingestion/src/datahub/ingestion/source/aws/glue.py
@@ -1,3 +1,4 @@
import datetime
import json
import logging
from collections import defaultdict
@@ -895,6 +896,12 @@ def gen_database_containers(
) -> Iterable[MetadataWorkUnit]:
domain_urn = self._gen_domain_urn(database["Name"])
database_container_key = self.gen_database_key(database["Name"])
parameters = database.get("Parameters", {})
if database.get("LocationUri") is not None:
parameters["LocationUri"] = database["LocationUri"]
if database.get("CreateTime") is not None:
create_time: datetime.datetime = database["CreateTime"]
parameters["CreateTime"] = create_time.strftime("%B %-d, %Y at %H:%M:%S")
yield from gen_containers(
container_key=database_container_key,
name=database["Name"],
@@ -904,6 +911,7 @@ def gen_database_containers(
qualified_name=self.get_glue_arn(
account_id=database["CatalogId"], database=database["Name"]
),
extra_properties=parameters,
)

def add_table_to_database_container(