diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index be9cb2642..146e07743 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
         types: [text]
         args:
           - --exclude-file=tests/sample/manifest_model_version.json
-          - --skip=**/manifest.json
+          - --skip=**/manifest.json,**.min.js
           - -L connexion,aci
   - repo: https://github.com/pre-commit/pygrep-hooks
     rev: v1.10.0
diff --git a/cosmos/plugin/__init__.py b/cosmos/plugin/__init__.py
new file mode 100644
index 000000000..48061b254
--- /dev/null
+++ b/cosmos/plugin/__init__.py
@@ -0,0 +1,202 @@
+import os.path as op
+from typing import Any, Dict, Optional, Tuple
+from urllib.parse import urlsplit
+
+from airflow.configuration import conf
+from airflow.plugins_manager import AirflowPlugin
+from airflow.security import permissions
+from airflow.www.auth import has_access
+from airflow.www.views import AirflowBaseView
+from flask import abort, url_for
+from flask_appbuilder import AppBuilder, expose
+
+
+def bucket_and_key(path: str) -> Tuple[str, str]:
+    parsed_url = urlsplit(path)
+    bucket = parsed_url.netloc
+    key = parsed_url.path.lstrip("/")
+    return bucket, key
+
+
+def open_s3_file(conn_id: Optional[str], path: str) -> str:
+    from airflow.providers.amazon.aws.hooks.s3 import S3Hook
+
+    if conn_id is None:
+        conn_id = S3Hook.default_conn_name
+
+    hook = S3Hook(aws_conn_id=conn_id)
+    bucket, key = bucket_and_key(path)
+    content = hook.read_key(key=key, bucket_name=bucket)
+    return content  # type: ignore[no-any-return]
+
+
+def open_gcs_file(conn_id: Optional[str], path: str) -> str:
+    from airflow.providers.google.cloud.hooks.gcs import GCSHook
+
+    if conn_id is None:
+        conn_id = GCSHook.default_conn_name
+
+    hook = GCSHook(gcp_conn_id=conn_id)
+    bucket, blob = bucket_and_key(path)
+    content = hook.download(bucket_name=bucket, object_name=blob)
+    return content.decode("utf-8")  # type: ignore[no-any-return]
+
+
+def open_azure_file(conn_id: Optional[str], path: str) -> str:
+    from airflow.providers.microsoft.azure.hooks.wasb import WasbHook
+
+    if conn_id is None:
+        conn_id = WasbHook.default_conn_name
+
+    hook = WasbHook(wasb_conn_id=conn_id)
+
+    container, blob = bucket_and_key(path)
+    content = hook.read_file(container_name=container, blob_name=blob)
+    return content  # type: ignore[no-any-return]
+
+
+def open_http_file(conn_id: Optional[str], path: str) -> str:
+    from airflow.providers.http.hooks.http import HttpHook
+
+    if conn_id is None:
+        conn_id = ""
+
+    hook = HttpHook(method="GET", http_conn_id=conn_id)
+    res = hook.run(endpoint=path)
+    hook.check_response(res)
+    return res.text  # type: ignore[no-any-return]
+
+
+def open_file(path: str) -> str:
+    """Retrieve a file from http, https, gs, s3, or wasb."""
+    conn_id: Optional[str] = conf.get("cosmos", "dbt_docs_conn_id", fallback=None)
+
+    if path.strip().startswith("s3://"):
+        return open_s3_file(conn_id=conn_id, path=path)
+    elif path.strip().startswith("gs://"):
+        return open_gcs_file(conn_id=conn_id, path=path)
+    elif path.strip().startswith("wasb://"):
+        return open_azure_file(conn_id=conn_id, path=path)
+    elif path.strip().startswith("http://") or path.strip().startswith("https://"):
+        return open_http_file(conn_id=conn_id, path=path)
+    else:
+        with open(path) as f:
+            content = f.read()
+        return content  # type: ignore[no-any-return]
+
+
+iframe_script = """
+
+"""
+
+
+class DbtDocsView(AirflowBaseView):
+    default_view = "dbt_docs"
+    route_base = "/cosmos"
+    template_folder = op.join(op.dirname(__file__), "templates")
+    static_folder = op.join(op.dirname(__file__), "static")
+
+    def create_blueprint(
+        self, appbuilder: AppBuilder, endpoint: Optional[str] = None, static_folder: Optional[str] = None
+    ) -> None:
+        # Make sure the static folder is not overwritten, as we want to use it.
+        return super().create_blueprint(appbuilder, endpoint=endpoint, static_folder=self.static_folder)  # type: ignore[no-any-return]
+
+    @expose("/dbt_docs")  # type: ignore[misc]
+    @has_access([(permissions.ACTION_CAN_READ, permissions.RESOURCE_WEBSITE)])
+    def dbt_docs(self) -> str:
+        if conf.get("cosmos", "dbt_docs_dir", fallback=None) is None:
+            return self.render_template("dbt_docs_not_set_up.html")  # type: ignore[no-any-return,no-untyped-call]
+        return self.render_template("dbt_docs.html")  # type: ignore[no-any-return,no-untyped-call]
+
+    @expose("/dbt_docs_index.html")  # type: ignore[misc]
+    @has_access([(permissions.ACTION_CAN_READ, permissions.RESOURCE_WEBSITE)])
+    def dbt_docs_index(self) -> str:
+        docs_dir = conf.get("cosmos", "dbt_docs_dir", fallback=None)
+        if docs_dir is None:
+            abort(404)
+        html = open_file(op.join(docs_dir, "index.html"))
+        # Hack the dbt docs to render properly in an iframe
+        iframe_resizer_url = url_for(".static", filename="iframeResizer.contentWindow.min.js")
+        html = html.replace("</head>", f'{iframe_script}<script src="{iframe_resizer_url}"></script></head>', 1)
+        return html
+
+    @expose("/catalog.json")  # type: ignore[misc]
+    @has_access([(permissions.ACTION_CAN_READ, permissions.RESOURCE_WEBSITE)])
+    def catalog(self) -> Tuple[str, int, Dict[str, Any]]:
+        docs_dir = conf.get("cosmos", "dbt_docs_dir", fallback=None)
+        if docs_dir is None:
+            abort(404)
+        data = open_file(op.join(docs_dir, "catalog.json"))
+        return data, 200, {"Content-Type": "application/json"}
+
+    @expose("/manifest.json")  # type: ignore[misc]
+    @has_access([(permissions.ACTION_CAN_READ, permissions.RESOURCE_WEBSITE)])
+    def manifest(self) -> Tuple[str, int, Dict[str, Any]]:
+        docs_dir = conf.get("cosmos", "dbt_docs_dir", fallback=None)
+        if docs_dir is None:
+            abort(404)
+        data = open_file(op.join(docs_dir, "manifest.json"))
+        return data, 200, {"Content-Type": "application/json"}
+
+
+dbt_docs_view = DbtDocsView()
+
+
+class CosmosPlugin(AirflowPlugin):
+    name = "cosmos"
+    appbuilder_views = [{"name": "dbt Docs", "category": "Browse", "view": dbt_docs_view}]
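A quick sketch of the path handling above: ``open_file`` dispatches purely on the URI scheme, and ``bucket_and_key`` leans entirely on ``urllib.parse.urlsplit``, so the bucket or container name comes from the URI's authority (netloc) and the object key from its path. A standalone illustration (the URIs below are hypothetical examples, not values from this PR):

    from typing import Tuple
    from urllib.parse import urlsplit


    def bucket_and_key(path: str) -> Tuple[str, str]:
        # Mirrors the plugin helper: netloc -> bucket/container,
        # path minus its leading slash -> object key / blob name.
        parsed_url = urlsplit(path)
        return parsed_url.netloc, parsed_url.path.lstrip("/")


    # Hypothetical URIs, purely for illustration:
    assert bucket_and_key("s3://my-bucket/docs/index.html") == ("my-bucket", "docs/index.html")
    assert bucket_and_key("gs://my-bucket/docs/manifest.json") == ("my-bucket", "docs/manifest.json")
    assert bucket_and_key("wasb://my-container/docs/catalog.json") == ("my-container", "docs/catalog.json")

The same helper serves S3, GCS, and Azure because all three URI styles put the bucket or container in the authority position.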
diff --git a/cosmos/plugin/static/iframeResizer.contentWindow.min.js b/cosmos/plugin/static/iframeResizer.contentWindow.min.js
new file mode 100644
index 000000000..914161c09
--- /dev/null
+++ b/cosmos/plugin/static/iframeResizer.contentWindow.min.js
@@ -0,0 +1,9 @@
+/*! iFrame Resizer (iframeSizer.contentWindow.min.js) - v4.3.5 - 2023-03-08
+ *  Desc: Include this file in any page being loaded into an iframe
+ *        to force the iframe to resize to the content size.
+ *  Requires: iframeResizer.min.js on host page.
+ *  Copyright: (c) 2023 David J. Bradshaw - dave@bradshaw.net
+ *  License: MIT
+ */
[vendored minified script body omitted]
diff --git a/cosmos/plugin/templates/dbt_docs.html b/cosmos/plugin/templates/dbt_docs.html
new file mode 100644
+{% endblock %}
diff --git a/cosmos/plugin/templates/dbt_docs_not_set_up.html b/cosmos/plugin/templates/dbt_docs_not_set_up.html
new file mode 100644
index 000000000..1fcc6ef7f
--- /dev/null
+++ b/cosmos/plugin/templates/dbt_docs_not_set_up.html
@@ -0,0 +1,9 @@
+{% extends base_template %}
+{% block content %}
+<h1>
+  ⚠️ Your dbt docs are not set up yet! ⚠️
+</h1>
+<p>
+  Read the Astronomer Cosmos docs for information on how to set up dbt docs.
+</p>
+{% endblock %}
diff --git a/dev/dags/dbt/jaffle_shop/.gitignore b/dev/dags/dbt/jaffle_shop/.gitignore
index 49f147cb9..45d294b9a 100644
--- a/dev/dags/dbt/jaffle_shop/.gitignore
+++ b/dev/dags/dbt/jaffle_shop/.gitignore
@@ -2,3 +2,4 @@
 target/
 dbt_packages/
 logs/
+!target/manifest.json
diff --git a/dev/docker-compose.yaml b/dev/docker-compose.yaml
index 23b012d15..5345f4b13 100644
--- a/dev/docker-compose.yaml
+++ b/dev/docker-compose.yaml
@@ -10,6 +10,7 @@ x-airflow-common:
   environment: &airflow-common-env
     DB_BACKEND: postgres
+    AIRFLOW__COSMOS__DBT_DOCS_DIR: http://cosmos-docs.s3-website-us-east-1.amazonaws.com/
     AIRFLOW__CORE__EXECUTOR: LocalExecutor
     AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:pg_password@postgres:5432/airflow
     AIRFLOW__CORE__FERNET_KEY: ''
diff --git a/docs/_static/location_of_dbt_docs_in_airflow.png b/docs/_static/location_of_dbt_docs_in_airflow.png
new file mode 100644
index 000000000..348a53c8e
Binary files /dev/null and b/docs/_static/location_of_dbt_docs_in_airflow.png differ
diff --git a/docs/configuration/generating-docs.rst b/docs/configuration/generating-docs.rst
index 6112ebcee..54ec80fc9 100644
--- a/docs/configuration/generating-docs.rst
+++ b/docs/configuration/generating-docs.rst
@@ -5,7 +5,9 @@ Generating Docs
 dbt allows you to generate static documentation on your models, tables, and more. You can read more about it in the `official dbt documentation <https://docs.getdbt.com/docs/collaborate/documentation>`_. For an example of what the docs look like with the ``jaffle_shop`` project, check out `this site <http://cosmos-docs.s3-website-us-east-1.amazonaws.com/>`_.
 
-Many users choose to generate and serve these docs on a static website. This is a great way to share your data models with your team and other stakeholders.
+After generating the dbt docs, you can host them natively within Airflow via the Cosmos Airflow plugin; see `Hosting Docs <hosting-docs.html>`__ for more information.
+
+Alternatively, many users choose to serve these docs on a separate static website. This is a great way to share your data models with a broad array of stakeholders.
 
 Cosmos offers two pre-built ways of generating and uploading dbt docs and a fallback option to run custom code after the docs are generated:
diff --git a/docs/configuration/hosting-docs.rst b/docs/configuration/hosting-docs.rst
new file mode 100644
index 000000000..5143a9f67
--- /dev/null
+++ b/docs/configuration/hosting-docs.rst
@@ -0,0 +1,127 @@
+.. _hosting-docs:
+
+Hosting Docs
+============
+
+dbt docs can be served directly from the Apache Airflow webserver with the Cosmos Airflow plugin, without requiring the user to set up anything outside of Airflow. This page describes how to host docs in the Airflow webserver directly, although some users may opt to host docs externally.
+
+Overview
+~~~~~~~~
+
+The dbt docs are available in the Airflow menu under ``Browse > dbt docs``:
+
+.. image:: /_static/location_of_dbt_docs_in_airflow.png
+   :alt: Airflow UI - Location of dbt docs in menu
+   :align: center
+
+To access the dbt docs, you must specify the following config variables:
+
+- ``cosmos.dbt_docs_dir``: A path to where the docs are being hosted.
+- (Optional) ``cosmos.dbt_docs_conn_id``: A conn ID to use for a cloud storage deployment. If not specified *and* the URI points to a cloud storage platform, the default conn ID for the corresponding AWS/Azure/GCP hook is used.
+
+.. code-block:: cfg
+
+   [cosmos]
+   dbt_docs_dir = path/to/docs/here
+   dbt_docs_conn_id = my_conn_id
+
+or as an environment variable:
+
+.. code-block:: shell
+
+   AIRFLOW__COSMOS__DBT_DOCS_DIR="path/to/docs/here"
+   AIRFLOW__COSMOS__DBT_DOCS_CONN_ID="my_conn_id"
+
+The path can be either a folder in the local file system the webserver is running on, or a URI to a cloud storage platform (S3, GCS, Azure).
+
+Host from Cloud Storage
+~~~~~~~~~~~~~~~~~~~~~~~
+
+For typical users, the recommended setup for hosting dbt docs is:
+
+1. Generate the docs via one of Cosmos' pre-built operators for generating dbt docs (see `Generating Docs <generating-docs.html>`__ for more information).
+2. Set ``cosmos.dbt_docs_dir`` to wherever the docs were uploaded.
+3. If you want to use a conn ID other than the default connection, set ``cosmos.dbt_docs_conn_id``. Otherwise, leave it unset.
+
+AWS S3 Example
+^^^^^^^^^^^^^^
+
+.. code-block:: cfg
+
+   [cosmos]
+   dbt_docs_dir = s3://my-bucket/path/to/docs
+   dbt_docs_conn_id = aws_default
+
+.. code-block:: shell
+
+   AIRFLOW__COSMOS__DBT_DOCS_DIR="s3://my-bucket/path/to/docs"
+   AIRFLOW__COSMOS__DBT_DOCS_CONN_ID="aws_default"
+
+Google Cloud Storage Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: cfg
+
+   [cosmos]
+   dbt_docs_dir = gs://my-bucket/path/to/docs
+   dbt_docs_conn_id = google_cloud_default
+
+.. code-block:: shell
+
+   AIRFLOW__COSMOS__DBT_DOCS_DIR="gs://my-bucket/path/to/docs"
+   AIRFLOW__COSMOS__DBT_DOCS_CONN_ID="google_cloud_default"
+
+Azure Blob Storage Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: cfg
+
+   [cosmos]
+   dbt_docs_dir = wasb://my-container/path/to/docs
+   dbt_docs_conn_id = wasb_default
+
+.. code-block:: shell
+
+   AIRFLOW__COSMOS__DBT_DOCS_DIR="wasb://my-container/path/to/docs"
+   AIRFLOW__COSMOS__DBT_DOCS_CONN_ID="wasb_default"
+
+Host from Local Storage
+~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, Cosmos does not generate docs on the fly; hosting from local storage only works if you pre-compile your dbt project before deployment.
+
+If your Airflow deployment process involves running ``dbt compile``, you will also want to add ``dbt docs generate`` to your deployment process to generate all the artifacts necessary to serve the dbt docs from local storage.
+
+dbt docs are generated into the ``target`` folder by default, so that folder will also be your default docs folder.
+
+For example, if your dbt project directory is ``/usr/local/airflow/dags/my_dbt_project``, then by default your dbt docs directory will be ``/usr/local/airflow/dags/my_dbt_project/target``:
+
+.. code-block:: cfg
+
+   [cosmos]
+   dbt_docs_dir = /usr/local/airflow/dags/my_dbt_project/target
+
+.. code-block:: shell
+
+   AIRFLOW__COSMOS__DBT_DOCS_DIR="/usr/local/airflow/dags/my_dbt_project/target"
+
+Serving docs from local storage has the downside that some values in the dbt docs can become stale unless the docs are periodically regenerated and redeployed:
+
+- Row counts.
+- The compiled SQL for incremental models before and after the first run.
+
+Host from HTTP/HTTPS
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: cfg
+
+   [cosmos]
+   dbt_docs_dir = https://my-site.com/path/to/docs
+
+.. code-block:: shell
+
+   AIRFLOW__COSMOS__DBT_DOCS_DIR="https://my-site.com/path/to/docs"
+
+You do not need to set a ``dbt_docs_conn_id`` when using HTTP/HTTPS.
+If you do set ``dbt_docs_conn_id``, the ``HttpHook`` will be used with that connection.
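All of the hosting options above reduce to two settings plus a scheme check. A condensed sketch of how the plugin resolves them, paraphrasing the ``conf.get`` and ``open_file`` logic from ``cosmos/plugin/__init__.py`` (the ``describe_docs_source`` helper is illustrative, not part of the PR):

    from airflow.configuration import conf


    def describe_docs_source() -> str:
        # fallback=None keeps both settings optional; the AIRFLOW__COSMOS__*
        # environment variables shown above feed these same lookups.
        docs_dir = conf.get("cosmos", "dbt_docs_dir", fallback=None)
        conn_id = conf.get("cosmos", "dbt_docs_conn_id", fallback=None)

        if docs_dir is None:
            return "dbt docs are not set up"

        path = docs_dir.strip()
        if path.startswith("s3://"):
            hook = "S3Hook"
        elif path.startswith("gs://"):
            hook = "GCSHook"
        elif path.startswith("wasb://"):
            hook = "WasbHook"
        elif path.startswith(("http://", "https://")):
            hook = "HttpHook"
        else:
            hook = "local filesystem (no hook)"
        return f"serving {path} via {hook} (conn_id={conn_id!r})"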
diff --git a/docs/configuration/index.rst b/docs/configuration/index.rst
index 8c282be03..919ed9b1e 100644
--- a/docs/configuration/index.rst
+++ b/docs/configuration/index.rst
@@ -16,6 +16,7 @@ Cosmos offers a number of configuration options to customize its behavior. For m
    Parsing Methods <parsing-methods>
    Configuring Lineage <lineage>
    Generating Docs <generating-docs>
+   Hosting Docs <hosting-docs>
    Scheduling <scheduling>
    Testing Behavior <testing-behavior>
    Selecting & Excluding <selecting-excluding>
diff --git a/pyproject.toml b/pyproject.toml
index 522431da7..7758f9669 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -95,6 +95,9 @@ azure-container-instance = [
 [project.entry-points.cosmos]
 provider_info = "cosmos:get_provider_info"
 
+[project.entry-points."airflow.plugins"]
+cosmos = "cosmos.plugin:CosmosPlugin"
+
 [project.urls]
 Homepage = "https://github.com/astronomer/astronomer-cosmos"
 Documentation = "https://astronomer.github.io/astronomer-cosmos"
diff --git a/tests/plugin/__init__.py b/tests/plugin/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/tests/plugin/test_plugin.py b/tests/plugin/test_plugin.py
new file mode 100644
index 000000000..df33ae13a
--- /dev/null
+++ b/tests/plugin/test_plugin.py
@@ -0,0 +1,223 @@
+# dbt-core relies on Jinja2>3, whereas Flask<2 relies on an incompatible version of Jinja2.
+#
+# This discrepancy causes the automated integration tests to fail, as dbt-core is installed in the same
+# environment as apache-airflow.
+#
+# We can get around this by patching the jinja2 namespace to include the deprecated objects:
+try:
+    import flask  # noqa: F401
+except ImportError:
+    import markupsafe
+    import jinja2
+
+    jinja2.Markup = markupsafe.Markup
+    jinja2.escape = markupsafe.escape
+
+from unittest.mock import mock_open, patch, MagicMock, PropertyMock
+
+import sys
+import pytest
+from airflow.configuration import conf
+from airflow.exceptions import AirflowConfigException
+from airflow.utils.db import initdb, resetdb
+from airflow.www.app import cached_app
+from airflow.www.extensions.init_appbuilder import AirflowAppBuilder
+from flask.testing import FlaskClient
+
+import cosmos.plugin
+
+from cosmos.plugin import (
+    dbt_docs_view,
+    iframe_script,
+    open_gcs_file,
+    open_azure_file,
+    open_http_file,
+    open_s3_file,
+    open_file,
+)
+
+
+original_conf_get = conf.get
+
+
+def _get_text_from_response(response) -> str:
+    # Airflow < 2.4 uses an old version of Werkzeug that does not have Response.text.
+    if not hasattr(response, "text"):
+        return response.get_data(as_text=True)
+    else:
+        return response.text
+
+
+@pytest.fixture(scope="module")
+def app() -> FlaskClient:
+    initdb()
+
+    app = cached_app(testing=True)
+    appbuilder: AirflowAppBuilder = app.extensions["appbuilder"]
+
+    appbuilder.sm.check_authorization = lambda *args, **kwargs: True
+
+    if dbt_docs_view not in appbuilder.baseviews:
+        appbuilder._check_and_init(dbt_docs_view)
+        appbuilder.register_blueprint(dbt_docs_view)
+
+    yield app.test_client()
+
+    resetdb(skip_init=True)
+
+
+def test_dbt_docs(monkeypatch, app):
+    def conf_get(section, key, *args, **kwargs):
+        if section == "cosmos" and key == "dbt_docs_dir":
+            return "path/to/docs/dir"
+        else:
+            return original_conf_get(section, key, *args, **kwargs)
+
+    monkeypatch.setattr(conf, "get", conf_get)
+
+    response = app.get("/cosmos/dbt_docs")
+
+    assert response.status_code == 200
+    assert "