Merge pull request #60 from data-catering/datahub
Add in support for datahub and confluent schema registry
pflooky authored Aug 16, 2024
2 parents f431b34 + 5f3e92c commit 1abfcc8
Showing 8 changed files with 423 additions and 48 deletions.
93 changes: 47 additions & 46 deletions README.md
@@ -120,50 +120,51 @@ POSTGRES_USER=my-user POSTGRES_PASSWORD=my-password ./run.sh postgres

## Services

| Service Type                | Service       | Supported |
|-----------------------------|---------------|-----------|
| Change Data Capture         | debezium      | ✅        |
| Database                    | cassandra     | ✅        |
| Database                    | cockroachdb   | ✅        |
| Database                    | elasticsearch | ✅        |
| Database                    | mariadb       | ✅        |
| Database                    | mongodb       | ✅        |
| Database                    | mssql         | ✅        |
| Database                    | mysql         | ✅        |
| Database                    | neo4j         | ✅        |
| Database                    | opensearch    | ✅        |
| Database                    | postgres      | ✅        |
| Database                    | spanner       | ✅        |
| Database                    | sqlite        | ✅        |
| Data Catalog                | amundsen      | ✅        |
| Data Catalog                | marquez       | ✅        |
| Data Catalog                | polaris       | ✅        |
| Data Catalog                | unitycatalog  | ✅        |
| Data Catalog                | datahub       | ❌        |
| Data Catalog                | openmetadata  | ❌        |
| Distributed Coordination    | zookeeper     | ✅        |
| Distributed Data Processing | flink         | ✅        |
| HTTP                        | httpbin       | ✅        |
| Identity Management         | keycloak      | ✅        |
| Job Orchestrator            | airflow       | ✅        |
| Job Orchestrator            | dagster       | ✅        |
| Job Orchestrator            | mage-ai       | ✅        |
| Job Orchestrator            | prefect       | ✅        |
| Messaging                   | activemq      | ✅        |
| Messaging                   | kafka         | ✅        |
| Messaging                   | rabbitmq      | ✅        |
| Messaging                   | solace        | ✅        |
| Notebook                    | jupyter       | ✅        |
| Object Storage              | minio         | ✅        |
| Query Engine                | duckdb        | ✅        |
| Query Engine                | flight-sql    | ✅        |
| Query Engine                | presto        | ✅        |
| Query Engine                | trino         | ✅        |
| Real-time OLAP              | clickhouse    | ✅        |
| Real-time OLAP              | doris         | ✅        |
| Real-time OLAP              | druid         | ✅        |
| Real-time OLAP              | pinot         | ✅        |
| Test Data Management        | data-caterer  | ✅        |
| Workflow                    | maestro       | ✅        |
| Workflow                    | temporal      | ✅        |
| Service Type                | Service                   | Supported |
|-----------------------------|---------------------------|-----------|
| Change Data Capture         | debezium                  | ✅        |
| Database                    | cassandra                 | ✅        |
| Database                    | cockroachdb               | ✅        |
| Database                    | elasticsearch             | ✅        |
| Database                    | mariadb                   | ✅        |
| Database                    | mongodb                   | ✅        |
| Database                    | mssql                     | ✅        |
| Database                    | mysql                     | ✅        |
| Database                    | neo4j                     | ✅        |
| Database                    | opensearch                | ✅        |
| Database                    | postgres                  | ✅        |
| Database                    | spanner                   | ✅        |
| Database                    | sqlite                    | ✅        |
| Data Catalog                | amundsen                  | ✅        |
| Data Catalog                | datahub                   | ✅        |
| Data Catalog                | marquez                   | ✅        |
| Data Catalog                | polaris                   | ✅        |
| Data Catalog                | unitycatalog              | ✅        |
| Data Catalog                | openmetadata              | ❌        |
| Distributed Coordination    | zookeeper                 | ✅        |
| Distributed Data Processing | flink                     | ✅        |
| HTTP                        | httpbin                   | ✅        |
| Identity Management         | keycloak                  | ✅        |
| Job Orchestrator            | airflow                   | ✅        |
| Job Orchestrator            | dagster                   | ✅        |
| Job Orchestrator            | mage-ai                   | ✅        |
| Job Orchestrator            | prefect                   | ✅        |
| Messaging                   | activemq                  | ✅        |
| Messaging                   | kafka                     | ✅        |
| Messaging                   | rabbitmq                  | ✅        |
| Messaging                   | solace                    | ✅        |
| Notebook                    | jupyter                   | ✅        |
| Object Storage              | minio                     | ✅        |
| Query Engine                | duckdb                    | ✅        |
| Query Engine                | flight-sql                | ✅        |
| Query Engine                | presto                    | ✅        |
| Query Engine                | trino                     | ✅        |
| Real-time OLAP              | clickhouse                | ✅        |
| Real-time OLAP              | doris                     | ✅        |
| Real-time OLAP              | druid                     | ✅        |
| Real-time OLAP              | pinot                     | ✅        |
| Schema Registry             | confluent-schema-registry | ✅        |
| Test Data Management        | data-caterer              | ✅        |
| Workflow                    | maestro                   | ✅        |
| Workflow                    | temporal                  | ✅        |
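
The newly added services follow the same `run.sh` pattern shown above. A minimal sketch, assuming the service names accepted by `run.sh` match the table (the script itself is not part of this diff):

```sh
# Start DataHub together with its dependencies (service name assumed to match the table)
./run.sh datahub

# Start Kafka plus the Confluent Schema Registry
./run.sh confluent-schema-registry
```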

11 changes: 11 additions & 0 deletions data/confluent-schema-registry/env/docker.env
@@ -0,0 +1,11 @@
SCHEMA_REGISTRY_HOST_NAME=schema-registry
SCHEMA_REGISTRY_KAFKASTORE_SECURITY_PROTOCOL=PLAINTEXT
SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS=kafka:29092

# Uncomment to customize the Schema Registry kafka store connection
# ZOOKEEPER_SASL_ENABLED=false
# KAFKA_OPTS=-Xms1g -Xmx1g
# SCHEMA_REGISTRY_JMX_OPTS=-Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false

# Uncomment to use schema registry < v5.4.0
# SCHEMA_REGISTRY_KAFKASTORE_CONNECTION_URL=zookeeper:2181
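
Once the registry is up, its standard REST API can confirm connectivity and register a test schema. A sketch, assuming port 8081 is published to the host; the subject name is a placeholder:

```sh
# List registered subjects (an empty list on a fresh instance)
curl http://localhost:8081/subjects

# Register a trivial Avro schema under a hypothetical subject
curl -X POST -H "Content-Type: application/vnd.schemaregistry.v1+json" \
  --data '{"schema": "{\"type\": \"string\"}"}' \
  http://localhost:8081/subjects/my-test-subject/versions
```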
38 changes: 38 additions & 0 deletions data/datahub-actions/env/docker.env
@@ -0,0 +1,38 @@
DATAHUB_GMS_PROTOCOL=http
DATAHUB_GMS_HOST=datahub-gms
DATAHUB_GMS_PORT=8080

KAFKA_BOOTSTRAP_SERVER=broker:29092
SCHEMA_REGISTRY_URL=http://schema-registry:8081
# SCHEMA_REGISTRY_URL=http://datahub-gms:8080/schema-registry/api/
METADATA_AUDIT_EVENT_NAME=MetadataAuditEvent_v4
METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME=MetadataChangeLog_Versioned_v1

# System Auth
DATAHUB_SYSTEM_CLIENT_ID=__datahub_system
DATAHUB_SYSTEM_CLIENT_SECRET=JohnSnowKnowsNothing

# Kafka Authentication
KAFKA_PROPERTIES_SECURITY_PROTOCOL=PLAINTEXT

# Uncomment the following if your Kafka deployment requires SSL.
# KAFKA_PROPERTIES_SSL_KEYSTORE_LOCATION=/mnt/certs/keystore
# KAFKA_PROPERTIES_SSL_TRUSTSTORE_LOCATION=/mnt/certs/truststore
# KAFKA_PROPERTIES_SSL_KEYSTORE_PASSWORD=keystore_password
# KAFKA_PROPERTIES_SSL_KEY_PASSWORD=keystore_password
# KAFKA_PROPERTIES_SSL_TRUSTSTORE_PASSWORD=truststore_password

# The following env vars are meant to be passed through from the host system
# to configure the Slack and Teams actions.
# The _ENABLED flags must be set to "true" (case-sensitive) for an action to be enabled.
DATAHUB_ACTIONS_SLACK_ENABLED
DATAHUB_ACTIONS_SLACK_DATAHUB_BASE_URL
DATAHUB_ACTIONS_SLACK_BOT_TOKEN
DATAHUB_ACTIONS_SLACK_SIGNING_SECRET
DATAHUB_ACTIONS_SLACK_CHANNEL
DATAHUB_ACTIONS_SLACK_SUPPRESS_SYSTEM_ACTIVITY

DATAHUB_ACTIONS_TEAMS_ENABLED
DATAHUB_ACTIONS_TEAMS_DATAHUB_BASE_URL
DATAHUB_ACTIONS_TEAMS_WEBHOOK_URL
DATAHUB_ACTIONS_TEAMS_SUPPRESS_SYSTEM_ACTIVITY
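
Because the variables above are declared without values, Docker reads them from the environment of the shell that launches the container. A sketch of enabling the Slack action this way (token and channel are placeholders, and the `run.sh datahub` invocation is assumed):

```sh
# Pass-through values come from the host shell at container start
DATAHUB_ACTIONS_SLACK_ENABLED=true \
DATAHUB_ACTIONS_SLACK_BOT_TOKEN=xoxb-placeholder-token \
DATAHUB_ACTIONS_SLACK_CHANNEL=C0123456789 \
./run.sh datahub
```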
71 changes: 71 additions & 0 deletions data/datahub-frontend/env/docker.env
@@ -0,0 +1,71 @@
DATAHUB_GMS_HOST=datahub-gms
DATAHUB_GMS_PORT=8080
DATAHUB_SECRET=YouKnowNothing
DATAHUB_APP_VERSION=1.0
DATAHUB_PLAY_MEM_BUFFER_SIZE=10MB
JAVA_OPTS=-Xms512m -Xmx512m -Dhttp.port=9002 -Dconfig.file=datahub-frontend/conf/application.conf -Djava.security.auth.login.config=datahub-frontend/conf/jaas.conf -Dlogback.configurationFile=datahub-frontend/conf/logback.xml -Dlogback.debug=false -Dpidfile.path=/dev/null

# Uncomment and set these to support SSL connections to GMS
# NOTE: GMS itself does not currently offer SSL support; these settings are intended
# for when a proxy that handles SSL, such as an EC2 load balancer, sits in front of GMS.
#DATAHUB_GMS_USE_SSL=true
#DATAHUB_GMS_SSL_PROTOCOL=

# Uncomment and set custom SSL truststore settings
# SSL_TRUSTSTORE_FILE=datahub-frontend/conf/truststore.jks
# SSL_TRUSTSTORE_TYPE=jks
# SSL_TRUSTSTORE_PASSWORD=MyTruststorePassword

# Uncomment to enable Metadata Service Authentication
# METADATA_SERVICE_AUTH_ENABLED=true

# Uncomment & populate these configs to enable OIDC SSO in the React application.
# Required OIDC configs
# AUTH_OIDC_ENABLED=true
# AUTH_OIDC_CLIENT_ID=1030786188615-rr9ics9gl8n4acngj9opqbf2mruflqpr.apps.googleusercontent.com
# AUTH_OIDC_CLIENT_SECRET=acEdaGcnfd7KxvsXRFDD7FNF
# AUTH_OIDC_DISCOVERY_URI=https://accounts.google.com/.well-known/openid-configuration
# AUTH_OIDC_BASE_URL=http://localhost:9001
# Optional OIDC configs
# AUTH_OIDC_USER_NAME_CLAIM=email
# AUTH_OIDC_USER_NAME_CLAIM_REGEX=([^@]+)
# AUTH_OIDC_SCOPE=
# Optional Provisioning Configs
# AUTH_OIDC_JIT_PROVISIONING_ENABLED=true
# AUTH_OIDC_PRE_PROVISIONING_REQUIRED=false
# AUTH_OIDC_EXTRACT_GROUPS_ENABLED=false
# AUTH_OIDC_GROUPS_CLAIM=groups

# Uncomment to disable JAAS username / password authentication (enabled by default)
# AUTH_JAAS_ENABLED=false

# Uncomment to disable persistence of client-side analytics events
# DATAHUB_ANALYTICS_ENABLED=false

# Required Kafka Producer Configs
KAFKA_BOOTSTRAP_SERVER=kafka:29092
DATAHUB_TRACKING_TOPIC=DataHubUsageEvent_v1

# Required Elastic Client Configuration (Analytics)
ELASTIC_CLIENT_HOST=elasticsearch
ELASTIC_CLIENT_PORT=9200

# Optional Elastic Client Configurations
# ELASTIC_CLIENT_THREAD_COUNT=2
# ELASTIC_CLIENT_CONNECTION_REQUEST_TIMEOUT=50

# To support SSL connections to Elastic, uncomment and set the following
# ELASTIC_CLIENT_USE_SSL=true
# ELASTIC_CLIENT_SSL_PROTOCOL=TLSv1.2
# ELASTIC_CLIENT_SSL_SECURE_RANDOM_IMPLEMENTATION=
# ELASTIC_CLIENT_SSL_TRUST_STORE_FILE=
# ELASTIC_CLIENT_SSL_TRUST_STORE_TYPE=
# ELASTIC_CLIENT_SSL_TRUST_STORE_PASSWORD=
# ELASTIC_CLIENT_SSL_KEY_STORE_FILE=
# ELASTIC_CLIENT_SSL_KEY_STORE_TYPE=
# ELASTIC_CLIENT_SSL_KEY_STORE_PASSWORD=

# To use simple username/password authentication to Elasticsearch over HTTPS
# set ELASTIC_CLIENT_USE_SSL=true and uncomment:
# ELASTIC_CLIENT_USERNAME=
# ELASTIC_CLIENT_PASSWORD=
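
With the frontend running, a quick smoke test is to log in through its REST endpoint. A sketch, assuming port 9002 (set via `-Dhttp.port` in `JAVA_OPTS` above) is published to the host and the default JAAS credentials are unchanged:

```sh
# Authenticate against the frontend and store the session cookie
curl -i -c /tmp/datahub-cookie \
  -H 'Content-Type: application/json' \
  -d '{"username": "datahub", "password": "datahub"}' \
  http://localhost:9002/logIn
```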
55 changes: 55 additions & 0 deletions data/datahub-gms/env/docker.env
@@ -0,0 +1,55 @@
DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-gms
EBEAN_DATASOURCE_USERNAME=datahub
EBEAN_DATASOURCE_PASSWORD=datahub
EBEAN_DATASOURCE_HOST=mysql:3306
EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
KAFKA_BOOTSTRAP_SERVER=kafka:29092
KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
# KAFKA_SCHEMAREGISTRY_URL=http://datahub-gms:8080/schema-registry/api/
ELASTICSEARCH_HOST=elasticsearch
ELASTICSEARCH_PORT=9200
ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
ES_BULK_REFRESH_POLICY=WAIT_UNTIL
GRAPH_SERVICE_DIFF_MODE_ENABLED=true
GRAPH_SERVICE_IMPL=elasticsearch
JAVA_OPTS=-Xms1g -Xmx1g
ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml

MAE_CONSUMER_ENABLED=true
MCE_CONSUMER_ENABLED=true
PE_CONSUMER_ENABLED=true
UI_INGESTION_ENABLED=true
ENTITY_SERVICE_ENABLE_RETENTION=true

ELASTIC_ID_HASH_ALGO=MD5

# Uncomment to disable persistence of client-side analytics events
# DATAHUB_ANALYTICS_ENABLED=false

# Uncomment to configure kafka topic names
# Make sure these names are consistent across the whole deployment
# METADATA_AUDIT_EVENT_NAME=MetadataAuditEvent_v4
# METADATA_CHANGE_EVENT_NAME=MetadataChangeEvent_v4
# FAILED_METADATA_CHANGE_EVENT_NAME=FailedMetadataChangeEvent_v4

# Uncomment and set these to support SSL connection to Elasticsearch
# ELASTICSEARCH_USE_SSL=true
# ELASTICSEARCH_SSL_PROTOCOL=TLSv1.2
# ELASTICSEARCH_SSL_SECURE_RANDOM_IMPL=
# ELASTICSEARCH_SSL_TRUSTSTORE_FILE=
# ELASTICSEARCH_SSL_TRUSTSTORE_TYPE=
# ELASTICSEARCH_SSL_TRUSTSTORE_PASSWORD=
# ELASTICSEARCH_SSL_KEYSTORE_FILE=
# ELASTICSEARCH_SSL_KEYSTORE_TYPE=
# ELASTICSEARCH_SSL_KEYSTORE_PASSWORD=

# To use simple username/password authentication to Elasticsearch over HTTPS
# set ELASTICSEARCH_USE_SSL=true and uncomment:
# ELASTICSEARCH_USERNAME=
# ELASTICSEARCH_PASSWORD=

# Uncomment to run a one-time upgrade to migrate legacy default browse path format to latest format
# More details can be found at https://datahubproject.io/docs/advanced/browse-paths-upgrade
# UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED=true
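
GMS exposes health and config endpoints that are handy for checking it came up with these settings. A sketch, assuming port 8080 is published to the host:

```sh
# Liveness check for the metadata service
curl http://localhost:8080/health

# Effective server configuration, useful for verifying the env vars took effect
curl http://localhost:8080/config
```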
55 changes: 55 additions & 0 deletions data/datahub-upgrade/env/docker-without-neo4j.env
@@ -0,0 +1,55 @@
DATAHUB_UPGRADE_HISTORY_KAFKA_CONSUMER_GROUP_ID=generic-duhe-consumer-job-client-gms
EBEAN_DATASOURCE_USERNAME=datahub
EBEAN_DATASOURCE_PASSWORD=datahub
EBEAN_DATASOURCE_HOST=mysql:3306
EBEAN_DATASOURCE_URL=jdbc:mysql://mysql:3306/datahub?verifyServerCertificate=false&useSSL=true&useUnicode=yes&characterEncoding=UTF-8
EBEAN_DATASOURCE_DRIVER=com.mysql.jdbc.Driver
KAFKA_BOOTSTRAP_SERVER=kafka:29092
KAFKA_SCHEMAREGISTRY_URL=http://schema-registry:8081
# KAFKA_SCHEMAREGISTRY_URL=http://datahub-gms:8080/schema-registry/api/
ELASTICSEARCH_HOST=elasticsearch
ELASTICSEARCH_PORT=9200
ELASTICSEARCH_INDEX_BUILDER_SETTINGS_REINDEX=true
ELASTICSEARCH_INDEX_BUILDER_MAPPINGS_REINDEX=true
ES_BULK_REFRESH_POLICY=WAIT_UNTIL
GRAPH_SERVICE_DIFF_MODE_ENABLED=true
GRAPH_SERVICE_IMPL=elasticsearch
JAVA_OPTS=-Xms1g -Xmx1g
ENTITY_REGISTRY_CONFIG_PATH=/datahub/datahub-gms/resources/entity-registry.yml

MAE_CONSUMER_ENABLED=true
MCE_CONSUMER_ENABLED=true
PE_CONSUMER_ENABLED=true
UI_INGESTION_ENABLED=true
ENTITY_SERVICE_ENABLE_RETENTION=true

ELASTIC_ID_HASH_ALGO=MD5

# Uncomment to disable persistence of client-side analytics events
# DATAHUB_ANALYTICS_ENABLED=false

# Uncomment to configure kafka topic names
# Make sure these names are consistent across the whole deployment
# METADATA_AUDIT_EVENT_NAME=MetadataAuditEvent_v4
# METADATA_CHANGE_EVENT_NAME=MetadataChangeEvent_v4
# FAILED_METADATA_CHANGE_EVENT_NAME=FailedMetadataChangeEvent_v4

# Uncomment and set these to support SSL connection to Elasticsearch
# ELASTICSEARCH_USE_SSL=true
# ELASTICSEARCH_SSL_PROTOCOL=TLSv1.2
# ELASTICSEARCH_SSL_SECURE_RANDOM_IMPL=
# ELASTICSEARCH_SSL_TRUSTSTORE_FILE=
# ELASTICSEARCH_SSL_TRUSTSTORE_TYPE=
# ELASTICSEARCH_SSL_TRUSTSTORE_PASSWORD=
# ELASTICSEARCH_SSL_KEYSTORE_FILE=
# ELASTICSEARCH_SSL_KEYSTORE_TYPE=
# ELASTICSEARCH_SSL_KEYSTORE_PASSWORD=

# To use simple username/password authentication to Elasticsearch over HTTPS
# set ELASTICSEARCH_USE_SSL=true and uncomment:
# ELASTICSEARCH_USERNAME=
# ELASTICSEARCH_PASSWORD=

# Uncomment to run a one-time upgrade to migrate legacy default browse path format to latest format
# More details can be found at https://datahubproject.io/docs/advanced/browse-paths-upgrade
# UPGRADE_DEFAULT_BROWSE_PATHS_ENABLED=true
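
This file mirrors the GMS environment because the upgrade job connects to the same MySQL, Kafka, and Elasticsearch backends. A sketch of invoking the one-shot upgrade container with it; the image tag, network name, and upgrade ID are assumptions based on upstream DataHub conventions:

```sh
# Run the upgrade job against the same backends the env file points at
docker run --env-file data/datahub-upgrade/env/docker-without-neo4j.env \
  --network <compose-network> \
  acryldata/datahub-upgrade:head -u SystemUpdate
```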
43 changes: 43 additions & 0 deletions data/mysql/data/datahub.sql
@@ -0,0 +1,43 @@
-- create datahub database
CREATE DATABASE IF NOT EXISTS `datahub` CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;
USE `datahub`;

-- create metadata aspect table
create table if not exists metadata_aspect_v2
(
urn varchar(500) not null,
aspect varchar(200) not null,
version bigint(20) not null,
metadata longtext not null,
systemmetadata longtext,
createdon datetime(6) not null,
createdby varchar(255) not null,
createdfor varchar(255),
constraint pk_metadata_aspect_v2 primary key (urn, aspect, version),
INDEX timeIndex(createdon)
) CHARACTER SET utf8mb4 COLLATE utf8mb4_bin;

-- create default records for datahub user if not exists
DROP TABLE if exists temp_metadata_aspect_v2;
CREATE TABLE temp_metadata_aspect_v2 LIKE metadata_aspect_v2;
INSERT INTO temp_metadata_aspect_v2 (urn, aspect, version, metadata, createdon, createdby)
VALUES ('urn:li:corpuser:datahub',
'corpUserInfo',
0,
'{"displayName":"Data Hub","active":true,"fullName":"Data Hub","email":"[email protected]"}',
now(),
'urn:li:corpuser:__datahub_system'),
('urn:li:corpuser:datahub',
'corpUserEditableInfo',
0,
'{"skills":[],"teams":[],"pictureLink":"https://raw.githubusercontent.com/datahub-project/datahub/master/datahub-web-react/src/images/default_avatar.png"}',
now(),
'urn:li:corpuser:__datahub_system');
-- only add the default records if metadata_aspect_v2 is empty
INSERT INTO metadata_aspect_v2
SELECT *
FROM temp_metadata_aspect_v2
WHERE NOT EXISTS (SELECT * from metadata_aspect_v2);
DROP TABLE temp_metadata_aspect_v2;

DROP TABLE IF EXISTS metadata_index;
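
After the container initializes, the seeded user can be verified directly against the same MySQL instance:

```sql
-- Expect two rows: corpUserInfo and corpUserEditableInfo for the default user
SELECT urn, aspect, version, createdby
FROM datahub.metadata_aspect_v2
WHERE urn = 'urn:li:corpuser:datahub';
```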