Commit d64a3c9

Adds documentation for connecting the OpenMetadata platform to the dwh and ingesting dwh asset metadata.

MattTriano committed Nov 8, 2024
1 parent b5bf405 commit d64a3c9
Showing 28 changed files with 172 additions and 49 deletions.
11 changes: 9 additions & 2 deletions .github/workflows/build-and-deploy-docs.yml
@@ -12,10 +12,17 @@ jobs:
  build-and-deploy:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
      - uses: actions/setup-python@v4
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: 3.x
      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
      - uses: actions/cache@v4
        with:
          key: mkdocs-material-${{ env.cache_id }}
          path: .cache
          restore-keys: |
            mkdocs-material-
      - run: pip install \
          mkdocs-material \
          mkdocs-glightbox
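For orientation, here is a minimal Python sketch (not part of the workflow) of what the new `cache_id` step computes: the current UTC ISO week number, which makes the `mkdocs-material-${{ env.cache_id }}` cache key roll over weekly.

```python
# Sketch only: reproduces the workflow's `date --utc '+%V'` cache-id logic.
from datetime import datetime, timezone

# %V is the ISO 8601 week number, so the derived cache key changes weekly.
cache_id = datetime.now(timezone.utc).strftime("%V")
print(f"mkdocs-material-{cache_id}")  # e.g. mkdocs-material-45
```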
22 changes: 21 additions & 1 deletion .startup/env_var_defaults.json
@@ -395,7 +395,6 @@
            "DWH_POSTGRES_USER": ".env.om_db::POSTGRES_DB"
        }
    },

    ".env.om_server::SERVER_PORT": {
        "file": ".env.om_server",
        "name": "SERVER_PORT",
@@ -482,6 +481,27 @@
        "user_input": false,
        "dependant_on_other_env_vars": false
    },
    ".env.om_server::AUTHORIZER_ADMIN_PRINCIPALS": {
        "file": ".env.om_server",
        "name": "AUTHORIZER_ADMIN_PRINCIPALS",
        "group": "Open Metadata Server",
        "default_value": "admin",
        "user_input": true,
        "valid_pattern": null,
        "invalid_substrings": "[\\s]",
        "is_list": true,
        "dependant_on_other_env_vars": false
    },
    ".env.om_server::AUTHORIZER_PRINCIPAL_DOMAIN": {
        "file": ".env.om_server",
        "name": "AUTHORIZER_PRINCIPAL_DOMAIN",
        "group": "Open Metadata Server",
        "default_value": "open-metadata.org",
        "user_input": true,
        "valid_pattern": null,
        "invalid_substrings": "[\\s]",
        "dependant_on_other_env_vars": false
    },
    ".env.om_server::AUTHORIZER_CLASS_NAME": {
        "file": ".env.om_server",
        "name": "AUTHORIZER_CLASS_NAME",
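To make the new `is_list` flag concrete, here is a sketch of the `.env.om_server` lines the generator would plausibly emit if the defaults above are accepted at the prompts. The `NAME=value` rendering is an assumption, since `prepare_dot_env_file_lines` is not shown in full here.

```python
# Assumed output, not verified against the repo: with the defaults accepted,
# the generated .env.om_server should contain lines like these.
expected_lines = {
    "AUTHORIZER_ADMIN_PRINCIPALS": "[admin]",  # is_list wraps the value in brackets
    "AUTHORIZER_PRINCIPAL_DOMAIN": "open-metadata.org",
}
for name, value in expected_lines.items():
    print(f"{name}={value}")
```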
79 changes: 43 additions & 36 deletions .startup/make_env_docker.py
@@ -3,7 +3,7 @@
from pathlib import Path
import re
import subprocess
from typing import Dict, List, Optional
from typing import Optional
import uuid
import urllib

@@ -36,7 +36,7 @@ def dot_env_file_already_exists(startup_dir: Path, file_name: str = ".env") -> b
    return False


def load_env_var_defaults_file(startup_dir: Path) -> Dict:
def load_env_var_defaults_file(startup_dir: Path) -> dict:
    default_env_vars_file_path = Path(startup_dir).joinpath("env_var_defaults.json")
    with open(default_env_vars_file_path, "r") as jf:
        default_env_vars_json = json.load(jf)
@@ -47,7 +47,8 @@ def get_and_validate_user_input(
    env_var: str,
    default_value: str,
    valid_input_pattern: Optional[str] = None,
    invalid_substrings: List[str] = [" ", "\n", "\t"],
    invalid_substrings: list[str] = [" ", "\n", "\t"],
    is_list: bool = False,
    max_tries: int = MAX_TRIES,
) -> str:
    msg = f"{env_var} [leave blank for default value: '{default_value}']: "
@@ -56,8 +57,8 @@ def get_and_validate_user_input(
        while tries_remaining > 0:
            input_val = input(msg)
            if input_val == "":
                return default_value

                output_value = default_value
                break
            if isinstance(invalid_substrings, str) and (invalid_substrings in input_val):
                print(f"Invalid value entered, can't contain this substring: {invalid_substrings}")
                tries_remaining = tries_remaining - 1
@@ -75,24 +76,28 @@ def get_and_validate_user_input(
                continue
            elif valid_input_pattern is not None:
                if re.match(valid_input_pattern, input_val):
                    return input_val
                    output_value = input_val
                    break
                else:
                    print(f"Invalid value entered, must match pattern {valid_input_pattern}")
                    tries_remaining = tries_remaining - 1
                    continue
        return input_val
        if is_list:
            output_value = f"[{','.join([el.strip() for el in output_value.split()])}]"
        return output_value
    except KeyboardInterrupt:
        print("Keyboard interrupted")
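As a quick illustration of the new `is_list` behavior, here is a standalone sketch (a reimplementation for demonstration, not code imported from this module) of the bracket-and-comma formatting applied to whitespace-separated entries:

```python
# Mirrors the is_list f-string in get_and_validate_user_input():
# whitespace-separated entries become a bracketed, comma-joined list.
def format_as_list(value: str) -> str:
    return f"[{','.join(el.strip() for el in value.split())}]"

assert format_as_list("admin") == "[admin]"
assert format_as_list("admin ops analyst") == "[admin,ops,analyst]"
```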


def orchestrate_user_input_prompts(env_var_dict: Dict) -> Dict:
def orchestrate_user_input_prompts(env_var_dict: dict) -> dict:
    for env_var_id, env_var_payload in env_var_dict.items():
        if env_var_payload["user_input"] == True:
            env_var_dict[env_var_id]["set_value"] = get_and_validate_user_input(
                env_var=env_var_payload["name"],
                default_value=env_var_payload["default_value"],
                valid_input_pattern=env_var_payload["valid_pattern"],
                invalid_substrings=env_var_payload["invalid_substrings"],
                is_list=env_var_payload.get("is_list", False),
            )
        elif env_var_payload["dependant_on_other_env_vars"] == True:
            env_var_mapper = env_var_payload["env_var_mappings"]
@@ -108,23 +113,23 @@ def orchestrate_user_input_prompts(env_var_dict: Dict) -> Dict:
    return env_var_dict


def get_env_var_payloads(env_var_dict: Dict) -> List:
def get_env_var_payloads(env_var_dict: dict) -> list:
    env_var_payloads = [v for k, v in env_var_dict.items()]
    return env_var_payloads


def get_distinct_dot_env_file_names(env_var_payloads: List) -> List:
def get_distinct_dot_env_file_names(env_var_payloads: list) -> list:
    return list(set([p["file"] for p in env_var_payloads]))


def create_dot_env_files(output_dir: Path, env_var_dict: Dict) -> None:
def create_dot_env_files(output_dir: Path, env_var_dict: dict) -> None:
    all_lines_all_files = prepare_dot_env_file_lines(output_dir, env_var_dict)
    for file_name, lines in all_lines_all_files.items():
        with open(file_name, "x") as f:
            f.write(lines)


def prepare_dot_env_file_lines(output_dir: Path, env_var_dict: Dict) -> None:
def prepare_dot_env_file_lines(output_dir: Path, env_var_dict: dict) -> None:
    env_var_payloads = get_env_var_payloads(env_var_dict=env_var_dict)
    dot_env_file_names = get_distinct_dot_env_file_names(env_var_payloads=env_var_payloads)
    all_lines_all_files = {}
@@ -147,28 +152,13 @@ def prepare_dot_env_file_lines(output_dir: Path, env_var_dict: Dict) -> None:
    return all_lines_all_files


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--startup_dir", default=".", help="The project's top-level directory")
    parser.add_argument(
        "--mode",
        default="interactive",
        help="Credential-defining process: options: ['interactive', 'dev']",
    )
    args = parser.parse_args()

    startup_dir = Path(args.startup_dir)
    if args.mode == "dev":
        output_dir = startup_dir.joinpath(".dev")
        output_dir.mkdir(exist_ok=True)
    else:
        output_dir = startup_dir
    dot_env_exists = dot_env_file_already_exists(startup_dir=output_dir, file_name=".env")
    dwh_dot_env_exists = dot_env_file_already_exists(startup_dir=output_dir, file_name=".env.dwh")
    superset_dot_env_exists = dot_env_file_already_exists(
        startup_dir=output_dir, file_name=".env.superset"
    )
    if dot_env_exists or dwh_dot_env_exists or superset_dot_env_exists:
def main(output_dir: Path) -> None:
    env_exists = dot_env_file_already_exists(output_dir, file_name=".env")
    dwh_exists = dot_env_file_already_exists(output_dir, file_name=".env.dwh")
    ss_exists = dot_env_file_already_exists(output_dir, file_name=".env.superset")
    om_db_exists = dot_env_file_already_exists(output_dir, file_name=".env.om_db")
    om_server_exists = dot_env_file_already_exists(output_dir, file_name=".env.om_server")
    if env_exists or dwh_exists or ss_exists or om_db_exists or om_server_exists:
        raise Exception(
            f"One or more dot-env file(s) would be overwritten. Backup and move .env files and "
            + "try again"
@@ -225,6 +215,23 @@ def prepare_dot_env_file_lines(output_dir: Path, env_var_dict: Dict) -> None:
        "group": "Open Metadata Server",
        "set_value": str(uuid.uuid4()),
    }
    # file_lines = prepare_dot_env_file_lines(output_dir=output_dir, env_var_dict=env_var_dict)
    # print(file_lines)
    create_dot_env_files(output_dir=output_dir, env_var_dict=env_var_dict)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--startup_dir", default=".", help="The project's top-level directory")
    parser.add_argument(
        "--mode",
        default="interactive",
        help="Credential-defining process: options: ['interactive', 'dev']",
    )
    args = parser.parse_args()

    startup_dir = Path(args.startup_dir)
    if args.mode == "dev":
        output_dir = startup_dir.joinpath(".dev")
        output_dir.mkdir(exist_ok=True)
    else:
        output_dir = startup_dir
    main(output_dir)
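A usage sketch based on the argparse setup above (running from the repo root is an assumption):

```python
# Example invocations, shown as comments:
#   python .startup/make_env_docker.py                  # interactive mode, prompts for values
#   python .startup/make_env_docker.py --mode dev       # writes dot-env files into ./.dev/
#   python .startup/make_env_docker.py --startup_dir /path/to/project
```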
(20 added image files, the screenshots referenced by the new OpenMetadata setup docs, could not be displayed.)
10 changes: 5 additions & 5 deletions docs/index.md
@@ -19,7 +19,7 @@ At present, it uses docker to provision and run:
* Dashboarding and Reporting

![Geospatial Data Analysis](/assets/imgs/superset/deckgl_polygon_chart_demo.png)
![Time Series Analysis](/assets/imgs/superset/median_sale_price_by_property_class.png)
![Dashboarding](/assets/imgs/superset/dashboard_demo.png)

* a **pgAdmin4** database administration interface,
@@ -33,11 +33,11 @@ At present, it uses docker to provision and run:
* **dbt** to:
* manage sequential data transformation + cleaning tasks,
* serve data documentation and data lineage graphs, and
* facilitate search of the data dictionary and data catalog

![dbt Data Lineage Graph](/assets/imgs/systems/dbt_lineage_graph_of_parcel_sales.png)
![All Data Tables' Lineage Graphs](/assets/imgs/dbt/lineage_graph_of_all_nodes.png)
![One Data Set's Lineage Graph](/assets/imgs/systems/dbt_data_docs_interface_showing_parcel_sales.png)

* great_expectations for anomaly detection and data monitoring, and

@@ -47,6 +47,6 @@

![data-loading TaskGroups in load_data_tg TaskGroup](/assets/imgs/Socrata_ELT_DAG/Full_view_data_loaders_in_load_data_tg.PNG)

![load_data_tg TaskGroup High Level](/assets/imgs/Socrata_ELT_DAG/load_data_task_group_w_checkpoints.png)

![automate as much pipeline development as possible](/assets/imgs/Socrata_ELT_DAG/generate_and_run_dbt_models.png)
3 changes: 2 additions & 1 deletion docs/setup/index.md
@@ -16,4 +16,5 @@ Work through these steps to set up all ADWH credentials and services. It should

1. [Set up your credentials and build the images](/setup/getting_started)
2. [Configure database connections for Superset](/setup/superset_setup)
3. [Configure database connections for pgAdmin4](/setup/pgAdmin4)
4. [Configure database connections and ingestions for OpenMetadata](/setup/openMetadata)
88 changes: 88 additions & 0 deletions docs/setup/openMetadata.md
@@ -0,0 +1,88 @@
# Setting up OpenMetadata

## Logging into the OpenMetadata UI
To access the OpenMetadata UI, go to [http://localhost:8585](http://localhost:8585) and log in using the credentials defined in [the initial setup step](/setup/getting_started).

* **Email:** This will consist of two environment variables from `.env.om_server` in the form below. The default email is `admin@open-metadata.org`.

`AUTHORIZER_ADMIN_PRINCIPALS@AUTHORIZER_PRINCIPAL_DOMAIN`

* **Password:** The default password is `admin`. You should probably [change that password](#changing-your-password) on your first login.
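The sketch below (a plain illustration, not repo code) shows how the two `.env.om_server` defaults combine into the login email:

```python
# Illustration: login email = AUTHORIZER_ADMIN_PRINCIPALS@AUTHORIZER_PRINCIPAL_DOMAIN
principals = "admin"             # AUTHORIZER_ADMIN_PRINCIPALS default
domain = "open-metadata.org"     # AUTHORIZER_PRINCIPAL_DOMAIN default
print(f"{principals}@{domain}")  # -> admin@open-metadata.org
```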

<center>![](/assets/imgs/openmetadata/om_setup__initial_login.png)</center>

!!! note

    If you're hosting the ADWH system on another machine, replace `localhost` with the domain name or IP address of that remote machine.


## Define a Connection to the Data Warehouse Database

1. Go to **Settings** > **Services** > **Databases** and **Add a New Service**

<center>![](/assets/imgs/openmetadata/om_setup__make_conn_1.png)</center>
<center>![](/assets/imgs/openmetadata/om_setup__make_conn_2.png)</center>
<center>![](/assets/imgs/openmetadata/om_setup__make_conn_3.png)</center>
<center>![](/assets/imgs/openmetadata/om_setup__make_conn_4.png)</center>

2. Make a Postgres Database Service

<center>![](/assets/imgs/openmetadata/om_setup__make_conn_5.png)</center>

2.a. Enter a name and a brief description of the database.

<center>![](/assets/imgs/openmetadata/om_setup__make_conn_6.png)</center>

2.b. Enter credentials for the DWH database role you want the service to use along with the other connection info. After entering credentials and other info, **Test your Connection**.
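As a reference for this form, here is a hypothetical set of values; the host and in-container port mirror the pgAdmin4 setup page, and the angle-bracketed entries are placeholders for values from your `.env` files.

```python
# Hypothetical Postgres-service form values (sketch, not repo code):
connection_form = {
    "host": "dwh_db",  # the docker-compose service name
    "port": 5432,      # the in-container Postgres port
    "username": "<your DWH_POSTGRES_USER>",
    "password": "<your DWH_POSTGRES_PASSWORD>",
    "database": "<your DWH database name>",
}
```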

<center>![](/assets/imgs/openmetadata/om_setup__make_conn_7.png)</center>
<center>![](/assets/imgs/openmetadata/om_setup__make_conn_8.png)</center>

If all connection checks pass, click **OK** and **Save** the connection.

If everything was successful, you should now see your `ADWH Data Warehouse` **Database Service**.

<center>![](/assets/imgs/openmetadata/om_setup__make_conn_9.png)</center>

## Configure Metadata Ingestion

Continuing from the last image, go to the page for the `ADWH Data Warehouse` **Database Service**. You will see the different Postgres databases in the `dwh_db` Postgres instance.

To scan those databases for **Data Assets** to catalog, you have to configure an **Ingestion**.

1. Click **Ingestions** > **Add Ingestion** > **Add Metadata Ingestion**

<center>![](/assets/imgs/openmetadata/om_setup__make_ingestion_1.png)</center>

<center>![](/assets/imgs/openmetadata/om_setup__make_ingestion_2.png)</center>

2. Specify the assets to include in this **Metadata Ingestion** configuration

Enter regex patterns to specify which database(s), schema(s), and table(s) should be included.
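For example, filter patterns along these lines (the database and schema names here are hypothetical; substitute your own):

```python
# Hypothetical include/exclude regexes for the ingestion filters:
database_filter_includes = ["dwh.*"]                 # only warehouse databases
schema_filter_includes = ["data_raw", "clean"]       # assumed schema names
table_filter_excludes = [".*_temp$", ".*_backup$"]   # skip scratch tables
```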

<center>![](/assets/imgs/openmetadata/om_setup__make_ingestion_3.png)</center>

<center>![](/assets/imgs/openmetadata/om_setup__make_ingestion_4.png)</center>

3. Set the schedule for running this **Ingestion** then **Add & Deploy** it

<center>![](/assets/imgs/openmetadata/om_setup__make_ingestion_5.png)</center>
<center>![](/assets/imgs/openmetadata/om_setup__make_ingestion_6.png)</center>

4. Trigger an initial **Run** to immediately run the **Metadata Ingestion**

<center>![](/assets/imgs/openmetadata/om_setup__make_ingestion_7.png)</center>

Now you should have a catalog of metadata for all **Data Assets** in the main DWH schemas.

### Changing your password

1. From the user profile dropdown in the upper right corner, click your username to access your user page

![](/assets/imgs/openmetadata/om_setup__pw_change_1.png)

2. Change your password

![](/assets/imgs/openmetadata/om_setup__pw_change_2.png)

<center>![](/assets/imgs/openmetadata/om_setup__pw_change_3.png)</center>
4 changes: 2 additions & 2 deletions docs/setup/pgAdmin4.md
@@ -40,7 +40,7 @@ Don't worry about the **Server group** field, the default is fine.

* **Host name/address:** `airflow_db`
* This is defined [here](https://github.com/MattTriano/analytics_data_where_house/blob/c75869ba6fae5c033e6601b9203fd178148f2777/docker-compose.yml#L34) in the `docker-compose.yml` file
* **Port:** 5432
* This is the database's port number inside the container, as defined to the right of the colon [here](https://github.com/MattTriano/analytics_data_where_house/blob/c75869ba6fae5c033e6601b9203fd178148f2777/docker-compose.yml#L44).
* **Username:** the `POSTGRES_USER` value in your `.env` file.
* **Password:** the `POSTGRES_PASSWORD` value in your `.env` file.
@@ -66,7 +66,7 @@ Repeat the process to connect to the data warehouse database.

* **Host name/address:** `dwh_db`
* This is defined [here](https://github.com/MattTriano/analytics_data_where_house/blob/c75869ba6fae5c033e6601b9203fd178148f2777/docker-compose.yml#L61) in the `docker-compose.yml` file
* **Port:** 5432
* This is the database's port number inside the container, as defined to the right of the colon [here](https://github.com/MattTriano/analytics_data_where_house/blob/c75869ba6fae5c033e6601b9203fd178148f2777/docker-compose.yml#L71)
* **Username:** the `DWH_POSTGRES_USER` value in your `.env` file
* **Password:** the `DWH_POSTGRES_PASSWORD` value in your `.env` file
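For anyone scripting against the warehouse rather than using pgAdmin4, here is a minimal psycopg2 sketch using the same in-network connection details described above. It assumes psycopg2 is installed, that the code runs on the same docker network, and that `DWH_POSTGRES_DB` names the target database in your `.env` file (an assumption; check your own variable names).

```python
import os

import psycopg2  # assumed installed: pip install psycopg2-binary

# Same details as the pgAdmin4 connection above: service name + in-container port.
conn = psycopg2.connect(
    host="dwh_db",
    port=5432,
    user=os.environ["DWH_POSTGRES_USER"],
    password=os.environ["DWH_POSTGRES_PASSWORD"],
    dbname=os.environ.get("DWH_POSTGRES_DB", "postgres"),  # assumed var name
)
print(conn.get_dsn_parameters())
conn.close()
```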
4 changes: 2 additions & 2 deletions mkdocs.yml
@@ -92,8 +92,8 @@ markdown_extensions:
  - pymdownx.inlinehilite
  - pymdownx.snippets
  - pymdownx.emoji:
      emoji_index: !!python/name:materialx.emoji.twemoji
      emoji_generator: !!python/name:materialx.emoji.to_svg
      emoji_index: !!python/name:material.extensions.emoji.twemoji
      emoji_generator: !!python/name:material.extensions.emoji.to_svg
  - pymdownx.superfences:
      custom_fences:
        - name: mermaid
