From 38bcd9c38197eb77b4693980bea0d57ffe23bbfe Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Mon, 16 Sep 2024 23:11:58 -0700 Subject: [PATCH] feat(ingest): default to ASYNC_BATCH mode in datahub-rest sink (#11369) --- docs/how/updating-datahub.md | 1 + metadata-ingestion/sink_docs/datahub.md | 47 ++++---- .../src/datahub/emitter/rest_emitter.py | 36 +++++-- .../datahub/ingestion/sink/datahub_rest.py | 12 ++- .../datahub/utilities/partition_executor.py | 101 ++++++++++++++++-- .../tests/unit/test_pipeline.py | 4 +- .../unit/utilities/test_partition_executor.py | 8 ++ 7 files changed, 168 insertions(+), 41 deletions(-) diff --git a/docs/how/updating-datahub.md b/docs/how/updating-datahub.md index abb6bcd32a554f..d8a6e4c6bdca06 100644 --- a/docs/how/updating-datahub.md +++ b/docs/how/updating-datahub.md @@ -37,6 +37,7 @@ This file documents any backwards-incompatible changes in DataHub and assists pe Re-running with stateful ingestion should automatically clear up the entities with old URNS and add entities with new URNs, therefore not duplicating the containers or jobs. - #11313 - `datahub get` will no longer return a key aspect for entities that don't exist. +- #11369 - The default datahub-rest sink mode has been changed to `ASYNC_BATCH`. This requires a server with version 0.14.0+. ### Potential Downtime diff --git a/metadata-ingestion/sink_docs/datahub.md b/metadata-ingestion/sink_docs/datahub.md index 8ddcf6aff10352..a60356fc3789f7 100644 --- a/metadata-ingestion/sink_docs/datahub.md +++ b/metadata-ingestion/sink_docs/datahub.md @@ -29,6 +29,7 @@ sink: ``` If you are connecting to a hosted DataHub Cloud instance, your sink will look like + ```yml source: # source configs @@ -68,16 +69,17 @@ If you are using [UI based ingestion](../../docs/ui-ingestion.md) then where GMS Note that a `.` is used to denote nested fields in the YAML recipe. | Field | Required | Default | Description | -|----------------------------|----------|----------------------|----------------------------------------------------------------------------------------------------| -| `server` | ✅ | | URL of DataHub GMS endpoint. | +| -------------------------- | -------- | -------------------- | -------------------------------------------------------------------------------------------------- | +| `server` | ✅ | | URL of DataHub GMS endpoint. | +| `token` | | | Bearer token used for authentication. | | `timeout_sec` | | 30 | Per-HTTP request timeout. | | `retry_max_times` | | 1 | Maximum times to retry if HTTP request fails. The delay between retries is increased exponentially | | `retry_status_codes` | | [429, 502, 503, 504] | Retry HTTP request also on these status codes | -| `token` | | | Bearer token used for authentication. | | `extra_headers` | | | Extra headers which will be added to the request. | -| `max_threads` | | `15` | Experimental: Max parallelism for REST API calls | -| `ca_certificate_path` | | | Path to server's CA certificate for verification of HTTPS communications | -| `client_certificate_path` | | | Path to client's CA certificate for HTTPS communications | +| `max_threads` | | `15` | Max parallelism for REST API calls | +| `mode` | | `ASYNC_BATCH` | [Advanced] Mode of operation - `SYNC`, `ASYNC`, or `ASYNC_BATCH` | +| `ca_certificate_path` | | | Path to server's CA certificate for verification of HTTPS communications | +| `client_certificate_path` | | | Path to client's CA certificate for HTTPS communications | | `disable_ssl_verification` | | false | Disable ssl certificate validation | ## DataHub Kafka @@ -115,14 +117,14 @@ sink: Note that a `.` is used to denote nested fields in the YAML recipe. -| Field | Required | Default | Description | -| -------------------------------------------- | -------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `connection.bootstrap` | ✅ | | Kafka bootstrap URL. | -| `connection.producer_config.